1/* Optimized strcasestr implementation for PowerPC64/POWER8. 2 Copyright (C) 2016-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20#include <locale-defines.h> 21 22/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ 23 24/* The performance gain is obtained by comparing 16 bytes. */ 25 26/* When the first char of r4 is hit ITERATIONS times in r3 27 fallback to default. */ 28#define ITERATIONS 64 29 30#ifndef STRCASESTR 31# define STRCASESTR __strcasestr 32#endif 33 34#ifndef STRLEN 35/* For builds without IFUNC support, local calls should be made to internal 36 GLIBC symbol (created by libc_hidden_builtin_def). */ 37# ifdef SHARED 38# define STRLEN __GI_strlen 39# else 40# define STRLEN strlen 41# endif 42#endif 43 44#ifndef STRNLEN 45/* For builds without IFUNC support, local calls should be made to internal 46 GLIBC symbol (created by libc_hidden_builtin_def). */ 47# ifdef SHARED 48# define STRNLEN __GI_strnlen 49# else 50# define STRNLEN __strnlen 51# endif 52#endif 53 54#ifndef STRCHR 55# ifdef SHARED 56# define STRCHR __GI_strchr 57# else 58# define STRCHR strchr 59# endif 60#endif 61 62/* Convert 16 bytes of v4 and reg to lowercase and compare. */ 63#define TOLOWER(reg) \ 64 vcmpgtub v6, v4, v1; \ 65 vcmpgtub v7, v2, v4; \ 66 vand v8, v7, v6; \ 67 vand v8, v8, v3; \ 68 vor v4, v8, v4; \ 69 vcmpgtub v6, reg, v1; \ 70 vcmpgtub v7, v2, reg; \ 71 vand v8, v7, v6; \ 72 vand v8, v8, v3; \ 73 vor reg, v8, reg; \ 74 vcmpequb. v6, reg, v4; 75 76#define FRAMESIZE (FRAME_MIN_SIZE+48) 77 .machine power8 78ENTRY (STRCASESTR, 4) 79 CALL_MCOUNT 2 80 mflr r0 /* Load link register LR to r0. */ 81 std r31, -8(r1) /* Save callers register r31. */ 82 std r30, -16(r1) /* Save callers register r30. */ 83 std r29, -24(r1) /* Save callers register r29. */ 84 std r28, -32(r1) /* Save callers register r28. */ 85 std r27, -40(r1) /* Save callers register r27. */ 86 std r0, 16(r1) /* Store the link register. */ 87 cfi_offset(r31, -8) 88 cfi_offset(r30, -16) 89 cfi_offset(r29, -24) 90 cfi_offset(r28, -32) 91 cfi_offset(r27, -40) 92 cfi_offset(lr, 16) 93 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ 94 cfi_adjust_cfa_offset(FRAMESIZE) 95 96 dcbt 0, r3 97 dcbt 0, r4 98 cmpdi cr7, r3, 0 /* Input validation. */ 99 beq cr7, L(retnull) 100 cmpdi cr7, r4, 0 101 beq cr7, L(retnull) 102 103 mr r29, r3 104 mr r30, r4 105 /* Load first byte from r4 and check if its null. */ 106 lbz r6, 0(r4) 107 cmpdi cr7, r6, 0 108 beq cr7, L(ret_r3) 109 110 ld r10, __libc_tsd_LOCALE@got@tprel(r2) 111 add r9, r10, __libc_tsd_LOCALE@tls 112 ld r9, 0(r9) 113 ld r9, LOCALE_CTYPE_TOUPPER(r9) 114 sldi r10, r6, 2 /* Convert to upper case. */ 115 lwzx r28, r9, r10 116 117 ld r10, __libc_tsd_LOCALE@got@tprel(r2) 118 add r11, r10, __libc_tsd_LOCALE@tls 119 ld r11, 0(r11) 120 ld r11, LOCALE_CTYPE_TOLOWER(r11) 121 sldi r10, r6, 2 /* Convert to lower case. */ 122 lwzx r27, r11, r10 123 124 /* Check if the first char is present. */ 125 mr r4, r27 126 bl STRCHR 127 nop 128 mr r5, r3 129 mr r3, r29 130 mr r29, r5 131 mr r4, r28 132 bl STRCHR 133 nop 134 cmpdi cr7, r29, 0 135 beq cr7, L(firstpos) 136 cmpdi cr7, r3, 0 137 beq cr7, L(skipcheck) 138 cmpw cr7, r3, r29 139 ble cr7, L(firstpos) 140 /* Move r3 to the first occurence. */ 141L(skipcheck): 142 mr r3, r29 143L(firstpos): 144 mr r29, r3 145 146 sldi r9, r27, 8 147 or r28, r9, r28 148 /* Reg r27 is used to count the number of iterations. */ 149 li r27, 0 150 /* If first char of search str is not present. */ 151 cmpdi cr7, r3, 0 152 ble cr7, L(end) 153 154 /* Find the length of pattern. */ 155 mr r3, r30 156 bl STRLEN 157 nop 158 159 cmpdi cr7, r3, 0 /* If search str is null. */ 160 beq cr7, L(ret_r3) 161 162 mr r31, r3 163 mr r4, r3 164 mr r3, r29 165 bl STRNLEN 166 nop 167 168 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ 169 blt cr7, L(retnull) 170 171 mr r3, r29 172 173 /* Locales not matching ASCII for single bytes. */ 174 ld r10, __libc_tsd_LOCALE@got@tprel(r2) 175 add r9, r10, __libc_tsd_LOCALE@tls 176 ld r9, 0(r9) 177 ld r7, 0(r9) 178 addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES 179 lwz r8, 0(r7) 180 cmpdi cr7, r8, 1 181 beq cr7, L(bytebybyte) 182 183 /* If len(r4) < 16 handle byte by byte. */ 184 /* For shorter strings we will not use vector registers. */ 185 cmpdi cr7, r31, 16 186 blt cr7, L(bytebybyte) 187 188 /* Comparison values used for TOLOWER. */ 189 /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ 190 vspltish v0, 0 191 vspltisb v5, 2 192 vspltisb v4, 4 193 vsl v3, v5, v4 194 vaddubm v1, v3, v3 195 vspltisb v5, 15 196 vaddubm v2, v5, v5 197 vaddubm v2, v1, v2 198 vspltisb v4, -3 199 vaddubm v2, v2, v4 200 201 /* 202 1. Load 16 bytes from r3 and r4 203 2. Check if there is null, If yes, proceed byte by byte path. 204 3. Else,Convert both to lowercase and compare. 205 4. If they are same proceed to 1. 206 5. If they dont match, find if first char of r4 is present in the 207 loaded 16 byte of r3. 208 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. 209 */ 210 211 mr r8, r3 /* Save r3 for future use. */ 212 mr r4, r30 /* Restore r4. */ 213 clrldi r10, r4, 60 214 lvx v5, 0, r4 /* Load 16 bytes from r4. */ 215 cmpdi cr7, r10, 0 216 beq cr7, L(begin2) 217 /* If r4 is unaligned, load another 16 bytes. */ 218#ifdef __LITTLE_ENDIAN__ 219 lvsr v7, 0, r4 220#else 221 lvsl v7, 0, r4 222#endif 223 addi r5, r4, 16 224 lvx v9, 0, r5 225#ifdef __LITTLE_ENDIAN__ 226 vperm v5, v9, v5, v7 227#else 228 vperm v5, v5, v9, v7 229#endif 230L(begin2): 231 lvx v4, 0, r3 232 vcmpequb. v7, v0, v4 /* Check for null. */ 233 beq cr6, L(nullchk6) 234 b L(trailcheck) 235 236 .align 4 237L(nullchk6): 238 clrldi r10, r3, 60 239 cmpdi cr7, r10, 0 240 beq cr7, L(next16) 241#ifdef __LITTLE_ENDIAN__ 242 lvsr v7, 0, r3 243#else 244 lvsl v7, 0, r3 245#endif 246 addi r5, r3, 16 247 /* If r3 is unaligned, load another 16 bytes. */ 248 lvx v10, 0, r5 249#ifdef __LITTLE_ENDIAN__ 250 vperm v4, v10, v4, v7 251#else 252 vperm v4, v4, v10, v7 253#endif 254L(next16): 255 vcmpequb. v6, v0, v5 /* Check for null. */ 256 beq cr6, L(nullchk) 257 b L(trailcheck) 258 259 .align 4 260L(nullchk): 261 vcmpequb. v6, v0, v4 262 beq cr6, L(nullchk1) 263 b L(retnull) 264 265 .align 4 266L(nullchk1): 267 /* Convert both v3 and v4 to lower. */ 268 TOLOWER(v5) 269 /* If both are same, branch to match. */ 270 blt cr6, L(match) 271 /* Find if the first char is present in next 15 bytes. */ 272#ifdef __LITTLE_ENDIAN__ 273 vspltb v6, v5, 15 274 vsldoi v7, v0, v4, 15 275#else 276 vspltb v6, v5, 0 277 vspltisb v7, 8 278 vslo v7, v4, v7 279#endif 280 vcmpequb v7, v6, v7 281 vcmpequb. v6, v0, v7 282 /* Shift r3 by 16 bytes and proceed. */ 283 blt cr6, L(shift16) 284 vclzd v8, v7 285#ifdef __LITTLE_ENDIAN__ 286 vspltb v6, v8, 15 287#else 288 vspltb v6, v8, 7 289#endif 290 vcmpequb. v6, v6, v1 291 /* Shift r3 by 8 bytes and proceed. */ 292 blt cr6, L(shift8) 293 b L(begin) 294 295 .align 4 296L(match): 297 /* There is a match of 16 bytes, check next bytes. */ 298 cmpdi cr7, r31, 16 299 mr r29, r3 300 beq cr7, L(ret_r3) 301 302L(secondmatch): 303 addi r3, r3, 16 304 addi r4, r4, 16 305 /* Load next 16 bytes of r3 and r4 and compare. */ 306 clrldi r10, r4, 60 307 cmpdi cr7, r10, 0 308 beq cr7, L(nextload) 309 /* Handle unaligned case. */ 310 vor v6, v9, v9 311 vcmpequb. v7, v0, v6 312 beq cr6, L(nullchk2) 313 b L(trailcheck) 314 315 .align 4 316L(nullchk2): 317#ifdef __LITTLE_ENDIAN__ 318 lvsr v7, 0, r4 319#else 320 lvsl v7, 0, r4 321#endif 322 addi r5, r4, 16 323 /* If r4 is unaligned, load another 16 bytes. */ 324 lvx v9, 0, r5 325#ifdef __LITTLE_ENDIAN__ 326 vperm v11, v9, v6, v7 327#else 328 vperm v11, v6, v9, v7 329#endif 330 b L(compare) 331 332 .align 4 333L(nextload): 334 lvx v11, 0, r4 335L(compare): 336 vcmpequb. v7, v0, v11 337 beq cr6, L(nullchk3) 338 b L(trailcheck) 339 340 .align 4 341L(nullchk3): 342 clrldi r10, r3, 60 343 cmpdi cr7, r10, 0 344 beq cr7, L(nextload1) 345 /* Handle unaligned case. */ 346 vor v4, v10, v10 347 vcmpequb. v7, v0, v4 348 beq cr6, L(nullchk4) 349 b L(retnull) 350 351 .align 4 352L(nullchk4): 353#ifdef __LITTLE_ENDIAN__ 354 lvsr v7, 0, r3 355#else 356 lvsl v7, 0, r3 357#endif 358 addi r5, r3, 16 359 /* If r3 is unaligned, load another 16 bytes. */ 360 lvx v10, 0, r5 361#ifdef __LITTLE_ENDIAN__ 362 vperm v4, v10, v4, v7 363#else 364 vperm v4, v4, v10, v7 365#endif 366 b L(compare1) 367 368 .align 4 369L(nextload1): 370 lvx v4, 0, r3 371L(compare1): 372 vcmpequb. v7, v0, v4 373 beq cr6, L(nullchk5) 374 b L(retnull) 375 376 .align 4 377L(nullchk5): 378 /* Convert both v3 and v4 to lower. */ 379 TOLOWER(v11) 380 /* If both are same, branch to secondmatch. */ 381 blt cr6, L(secondmatch) 382 /* Continue the search. */ 383 b L(begin) 384 385 .align 4 386L(trailcheck): 387 ld r10, __libc_tsd_LOCALE@got@tprel(r2) 388 add r11, r10, __libc_tsd_LOCALE@tls 389 ld r11, 0(r11) 390 ld r11, LOCALE_CTYPE_TOLOWER(r11) 391L(loop2): 392 lbz r5, 0(r3) /* Load byte from r3. */ 393 lbz r6, 0(r4) /* Load next byte from r4. */ 394 cmpdi cr7, r6, 0 /* Is it null? */ 395 beq cr7, L(updater3) 396 cmpdi cr7, r5, 0 /* Is it null? */ 397 beq cr7, L(retnull) /* If yes, return. */ 398 addi r3, r3, 1 399 addi r4, r4, 1 /* Increment r4. */ 400 sldi r10, r5, 2 /* Convert to lower case. */ 401 lwzx r10, r11, r10 402 sldi r7, r6, 2 /* Convert to lower case. */ 403 lwzx r7, r11, r7 404 cmpw cr7, r7, r10 /* Compare with byte from r4. */ 405 bne cr7, L(begin) 406 b L(loop2) 407 408 .align 4 409L(shift8): 410 addi r8, r8, 7 411 b L(begin) 412 .align 4 413L(shift16): 414 addi r8, r8, 15 415 .align 4 416L(begin): 417 addi r8, r8, 1 418 mr r3, r8 419 /* When our iterations exceed ITERATIONS,fall back to default. */ 420 addi r27, r27, 1 421 cmpdi cr7, r27, ITERATIONS 422 beq cr7, L(default) 423 mr r4, r30 /* Restore r4. */ 424 b L(begin2) 425 426 /* Handling byte by byte. */ 427 .align 4 428L(loop1): 429 mr r3, r8 430 addi r27, r27, 1 431 cmpdi cr7, r27, ITERATIONS 432 beq cr7, L(default) 433 mr r29, r8 434 srdi r4, r28, 8 435 /* Check if the first char is present. */ 436 bl STRCHR 437 nop 438 mr r5, r3 439 mr r3, r29 440 mr r29, r5 441 sldi r4, r28, 56 442 srdi r4, r4, 56 443 bl STRCHR 444 nop 445 cmpdi cr7, r29, 0 446 beq cr7, L(nextpos) 447 cmpdi cr7, r3, 0 448 beq cr7, L(skipcheck1) 449 cmpw cr7, r3, r29 450 ble cr7, L(nextpos) 451 /* Move r3 to first occurence. */ 452L(skipcheck1): 453 mr r3, r29 454L(nextpos): 455 mr r29, r3 456 cmpdi cr7, r3, 0 457 ble cr7, L(retnull) 458L(bytebybyte): 459 ld r10, __libc_tsd_LOCALE@got@tprel(r2) 460 add r11, r10, __libc_tsd_LOCALE@tls 461 ld r11, 0(r11) 462 ld r11, LOCALE_CTYPE_TOLOWER(r11) 463 mr r4, r30 /* Restore r4. */ 464 mr r8, r3 /* Save r3. */ 465 addi r8, r8, 1 466 467L(loop): 468 addi r3, r3, 1 469 lbz r5, 0(r3) /* Load byte from r3. */ 470 addi r4, r4, 1 /* Increment r4. */ 471 lbz r6, 0(r4) /* Load next byte from r4. */ 472 cmpdi cr7, r6, 0 /* Is it null? */ 473 beq cr7, L(updater3) 474 cmpdi cr7, r5, 0 /* Is it null? */ 475 beq cr7, L(retnull) /* If yes, return. */ 476 sldi r10, r5, 2 /* Convert to lower case. */ 477 lwzx r10, r11, r10 478 sldi r7, r6, 2 /* Convert to lower case. */ 479 lwzx r7, r11, r7 480 cmpw cr7, r7, r10 /* Compare with byte from r4. */ 481 bne cr7, L(loop1) 482 b L(loop) 483 484 /* Handling return values. */ 485 .align 4 486L(updater3): 487 subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ 488 b L(end) 489 490 .align 4 491L(ret_r3): 492 mr r3, r29 /* Return point of match. */ 493 b L(end) 494 495 .align 4 496L(retnull): 497 li r3, 0 /* Substring was not found. */ 498 b L(end) 499 500 .align 4 501L(default): 502 mr r4, r30 503 bl __strcasestr_ppc 504 nop 505 506 .align 4 507L(end): 508 addi r1, r1, FRAMESIZE /* Restore stack pointer. */ 509 cfi_adjust_cfa_offset(-FRAMESIZE) 510 ld r0, 16(r1) /* Restore the saved link register. */ 511 ld r27, -40(r1) 512 ld r28, -32(r1) 513 ld r29, -24(r1) /* Restore callers save register r29. */ 514 ld r30, -16(r1) /* Restore callers save register r30. */ 515 ld r31, -8(r1) /* Restore callers save register r31. */ 516 cfi_restore(lr) 517 cfi_restore(r27) 518 cfi_restore(r28) 519 cfi_restore(r29) 520 cfi_restore(r30) 521 cfi_restore(r31) 522 mtlr r0 /* Branch to link register. */ 523 blr 524END (STRCASESTR) 525 526weak_alias (__strcasestr, strcasestr) 527libc_hidden_def (__strcasestr) 528libc_hidden_builtin_def (strcasestr) 529