1/* Optimized strchr implementation for PowerPC64/POWER8. 2 Copyright (C) 2016-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21#ifdef USE_AS_STRCHRNUL 22# ifndef STRCHRNUL 23# define FUNC_NAME __strchrnul 24# else 25# define FUNC_NAME STRCHRNUL 26# endif 27#else 28# ifndef STRCHR 29# define FUNC_NAME strchr 30# else 31# define FUNC_NAME STRCHR 32# endif 33#endif /* !USE_AS_STRCHRNUL */ 34 35/* int [r3] strchr (char *s [r3], int c [r4]) */ 36 .machine power8 37ENTRY_TOCLESS (FUNC_NAME) 38 CALL_MCOUNT 2 39 dcbt 0,r3 40 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ 41 cmpdi cr7,r4,0 42 ld r12,0(r8) /* Load doubleword from memory. */ 43 li r0,0 /* Doubleword with null chars to use 44 with cmpb. */ 45 46 rlwinm r6,r3,3,26,28 /* Calculate padding. */ 47 48 beq cr7,L(null_match) 49 50 /* Replicate byte to doubleword. */ 51 insrdi r4,r4,8,48 52 insrdi r4,r4,16,32 53 insrdi r4,r4,32,0 54 55 /* Now r4 has a doubleword of c bytes and r0 has 56 a doubleword of null bytes. */ 57 58 cmpb r10,r12,r4 /* Compare each byte against c byte. */ 59 cmpb r11,r12,r0 /* Compare each byte against null byte. */ 60 61 /* Move the doublewords left and right to discard the bits that are 62 not part of the string and bring them back as zeros. */ 63#ifdef __LITTLE_ENDIAN__ 64 srd r10,r10,r6 65 srd r11,r11,r6 66 sld r10,r10,r6 67 sld r11,r11,r6 68#else 69 sld r10,r10,r6 70 sld r11,r11,r6 71 srd r10,r10,r6 72 srd r11,r11,r6 73#endif 74 or r5,r10,r11 /* OR the results to speed things up. */ 75 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes 76 have been found. */ 77 bne cr7,L(done) 78 79 mtcrf 0x01,r8 80 81 /* Are we now aligned to a doubleword boundary? If so, skip to 82 the main loop. Otherwise, go through the alignment code. */ 83 84 bt 28,L(loop) 85 86 /* Handle WORD2 of pair. */ 87 ldu r12,8(r8) 88 cmpb r10,r12,r4 89 cmpb r11,r12,r0 90 or r5,r10,r11 91 cmpdi cr7,r5,0 92 bne cr7,L(done) 93 b L(loop) /* We branch here (rather than falling through) 94 to skip the nops due to heavy alignment 95 of the loop below. */ 96 97 .p2align 5 98L(loop): 99 /* Load two doublewords, compare and merge in a 100 single register for speed. This is an attempt 101 to speed up the null-checking process for bigger strings. */ 102 ld r12,8(r8) 103 ldu r9,16(r8) 104 cmpb r10,r12,r4 105 cmpb r11,r12,r0 106 cmpb r6,r9,r4 107 cmpb r7,r9,r0 108 or r5,r10,r11 109 or r9,r6,r7 110 or r12,r5,r9 111 cmpdi cr7,r12,0 112 beq cr7,L(vector) 113 /* OK, one (or both) of the doublewords contains a c/null byte. Check 114 the first doubleword and decrement the address in case the first 115 doubleword really contains a c/null byte. */ 116 117 cmpdi cr6,r5,0 118 addi r8,r8,-8 119 bne cr6,L(done) 120 121 /* The c/null byte must be in the second doubleword. Adjust the 122 address again and move the result of cmpb to r10 so we can calculate 123 the pointer. */ 124 125 mr r10,r6 126 mr r11,r7 127 addi r8,r8,8 128#ifdef USE_AS_STRCHRNUL 129 mr r5, r9 130#endif 131 /* r10/r11 have the output of the cmpb instructions, that is, 132 0xff in the same position as the c/null byte in the original 133 doubleword from the string. Use that to calculate the pointer. */ 134L(done): 135#ifdef USE_AS_STRCHRNUL 136 mr r10, r5 137#endif 138#ifdef __LITTLE_ENDIAN__ 139 addi r3,r10,-1 140 andc r3,r3,r10 141 popcntd r0,r3 142# ifndef USE_AS_STRCHRNUL 143 addi r4,r11,-1 144 andc r4,r4,r11 145 cmpld cr7,r3,r4 146 bgt cr7,L(no_match) 147# endif 148#else 149 cntlzd r0,r10 /* Count leading zeros before c matches. */ 150# ifndef USE_AS_STRCHRNUL 151 cmpld cr7,r11,r10 152 bgt cr7,L(no_match) 153# endif 154#endif 155 srdi r0,r0,3 /* Convert leading zeros to bytes. */ 156 add r3,r8,r0 /* Return address of the matching c byte 157 or null in case c was not found. */ 158 blr 159 160 /* Check the first 32B in GPR's and move to vectorized loop. */ 161 .p2align 5 162L(vector): 163 addi r3, r8, 8 164 andi. r10, r3, 31 165 bne cr0, L(loop) 166 vspltisb v0, 0 167 /* Precompute vbpermq constant. */ 168 vspltisb v10, 3 169 lvsl v11, r0, r0 170 vslb v10, v11, v10 171 mtvrd v1, r4 172 li r5, 16 173 vspltb v1, v1, 7 174 /* Compare 32 bytes in each loop. */ 175L(continue): 176 lvx v4, 0, r3 177 lvx v5, r3, r5 178 vcmpequb v2, v0, v4 179 vcmpequb v3, v0, v5 180 vcmpequb v6, v1, v4 181 vcmpequb v7, v1, v5 182 vor v8, v2, v3 183 vor v9, v6, v7 184 vor v11, v8, v9 185 vcmpequb. v11, v0, v11 186 addi r3, r3, 32 187 blt cr6, L(continue) 188 /* One (or both) of the quadwords contains a c/null byte. */ 189 addi r3, r3, -32 190#ifndef USE_AS_STRCHRNUL 191 vcmpequb. v11, v0, v9 192 blt cr6, L(no_match) 193#endif 194 /* Permute the first bit of each byte into bits 48-63. */ 195 vbpermq v2, v2, v10 196 vbpermq v3, v3, v10 197 vbpermq v6, v6, v10 198 vbpermq v7, v7, v10 199 /* Shift each component into its correct position for merging. */ 200#ifdef __LITTLE_ENDIAN__ 201 vsldoi v3, v3, v3, 2 202 vsldoi v7, v7, v7, 2 203#else 204 vsldoi v2, v2, v2, 6 205 vsldoi v3, v3, v3, 4 206 vsldoi v6, v6, v6, 6 207 vsldoi v7, v7, v7, 4 208#endif 209 210 /* Merge the results and move to a GPR. */ 211 vor v1, v3, v2 212 vor v2, v6, v7 213 vor v4, v1, v2 214 mfvrd r5, v4 215#ifdef __LITTLE_ENDIAN__ 216 addi r6, r5, -1 217 andc r6, r6, r5 218 popcntd r6, r6 219#else 220 cntlzd r6, r5 /* Count leading zeros before the match. */ 221#endif 222 add r3, r3, r6 /* Compute final length. */ 223 /* Return NULL if null found before c. */ 224#ifndef USE_AS_STRCHRNUL 225 lbz r4, 0(r3) 226 cmpdi cr7, r4, 0 227 beq cr7, L(no_match) 228#endif 229 blr 230 231#ifndef USE_AS_STRCHRNUL 232 .align 4 233L(no_match): 234 li r3,0 235 blr 236#endif 237 238/* We are here because strchr was called with a null byte. */ 239 .align 4 240L(null_match): 241 /* r0 has a doubleword of null bytes. */ 242 243 cmpb r5,r12,r0 /* Compare each byte against null bytes. */ 244 245 /* Move the doublewords left and right to discard the bits that are 246 not part of the string and bring them back as zeros. */ 247#ifdef __LITTLE_ENDIAN__ 248 srd r5,r5,r6 249 sld r5,r5,r6 250#else 251 sld r5,r5,r6 252 srd r5,r5,r6 253#endif 254 cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes 255 have been found. */ 256 bne cr7,L(done_null) 257 258 mtcrf 0x01,r8 259 260 /* Are we now aligned to a quadword boundary? If so, skip to 261 the main loop. Otherwise, go through the alignment code. */ 262 263 bt 28,L(loop_null) 264 265 /* Handle WORD2 of pair. */ 266 ldu r12,8(r8) 267 cmpb r5,r12,r0 268 cmpdi cr7,r5,0 269 bne cr7,L(done_null) 270 b L(loop_null) /* We branch here (rather than falling through) 271 to skip the nops due to heavy alignment 272 of the loop below. */ 273 274 /* Main loop to look for the end of the string. Since it's a 275 small loop (< 8 instructions), align it to 32-bytes. */ 276 .p2align 5 277L(loop_null): 278 /* Load two doublewords, compare and merge in a 279 single register for speed. This is an attempt 280 to speed up the null-checking process for bigger strings. */ 281 ld r12,8(r8) 282 ldu r11,16(r8) 283 cmpb r5,r12,r0 284 cmpb r10,r11,r0 285 or r6,r5,r10 286 cmpdi cr7,r6,0 287 beq cr7,L(vector1) 288 289 /* OK, one (or both) of the doublewords contains a null byte. Check 290 the first doubleword and decrement the address in case the first 291 doubleword really contains a null byte. */ 292 293 cmpdi cr6,r5,0 294 addi r8,r8,-8 295 bne cr6,L(done_null) 296 297 /* The null byte must be in the second doubleword. Adjust the address 298 again and move the result of cmpb to r10 so we can calculate the 299 pointer. */ 300 301 mr r5,r10 302 addi r8,r8,8 303 304 /* r5 has the output of the cmpb instruction, that is, it contains 305 0xff in the same position as the null byte in the original 306 doubleword from the string. Use that to calculate the pointer. */ 307L(done_null): 308#ifdef __LITTLE_ENDIAN__ 309 addi r0,r5,-1 310 andc r0,r0,r5 311 popcntd r0,r0 312#else 313 cntlzd r0,r5 /* Count leading zeros before the match. */ 314#endif 315 srdi r0,r0,3 /* Convert leading zeros to bytes. */ 316 add r3,r8,r0 /* Return address of the matching null byte. */ 317 blr 318 .p2align 5 319L(vector1): 320 addi r3, r8, 8 321 andi. r10, r3, 31 322 bne cr0, L(loop_null) 323 vspltisb v8, -1 324 vspltisb v0, 0 325 vspltisb v10, 3 326 lvsl v11, r0, r0 327 vslb v10, v11, v10 328 li r5, 16 329L(continue1): 330 lvx v4, 0, r3 331 lvx v5, r3, r5 332 vcmpequb v2, v0, v4 333 vcmpequb v3, v0, v5 334 vor v8, v2, v3 335 vcmpequb. v11, v0, v8 336 addi r3, r3, 32 337 blt cr6, L(continue1) 338 addi r3, r3, -32 339L(end1): 340 vbpermq v2, v2, v10 341 vbpermq v3, v3, v10 342 /* Shift each component into its correct position for merging. */ 343#ifdef __LITTLE_ENDIAN__ 344 vsldoi v3, v3, v3, 2 345#else 346 vsldoi v2, v2, v2, 6 347 vsldoi v3, v3, v3, 4 348#endif 349 350 /* Merge the results and move to a GPR. */ 351 vor v4, v3, v2 352 mfvrd r5, v4 353#ifdef __LITTLE_ENDIAN__ 354 addi r6, r5, -1 355 andc r6, r6, r5 356 popcntd r6, r6 357#else 358 cntlzd r6, r5 /* Count leading zeros before the match. */ 359#endif 360 add r3, r3, r6 /* Compute final length. */ 361 blr 362END (FUNC_NAME) 363 364#ifndef USE_AS_STRCHRNUL 365weak_alias (strchr, index) 366libc_hidden_builtin_def (strchr) 367#endif 368