1/* Optimized strstr implementation for PowerPC64/POWER7. 2 Copyright (C) 2015-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21/* Char * [r3] strstr (char *s [r3], char * pat[r4]) */ 22 23/* The performance gain is obtained using aligned memory access, load 24 * doubleword and usage of cmpb instruction for quicker comparison. */ 25 26#define ITERATIONS 64 27 28#ifndef STRSTR 29# define STRSTR strstr 30#endif 31 32#ifndef STRLEN 33/* For builds with no IFUNC support, local calls should be made to internal 34 GLIBC symbol (created by libc_hidden_builtin_def). */ 35# ifdef SHARED 36# define STRLEN __GI_strlen 37# define STRLEN_is_local 38# else 39# define STRLEN strlen 40# endif 41#endif 42 43#ifndef STRNLEN 44/* For builds with no IFUNC support, local calls should be made to internal 45 GLIBC symbol (created by libc_hidden_builtin_def). */ 46# ifdef SHARED 47# define STRNLEN __GI_strnlen 48# define STRNLEN_is_local 49# else 50# define STRNLEN __strnlen 51# endif 52#endif 53 54#ifndef STRCHR 55# ifdef SHARED 56# define STRCHR __GI_strchr 57# define STRCHR_is_local 58# else 59# define STRCHR strchr 60# endif 61#endif 62 63#define FRAMESIZE (FRAME_MIN_SIZE+32) 64 .machine power7 65/* Can't be ENTRY_TOCLESS due to calling __strstr_ppc which uses r2. */ 66ENTRY (STRSTR, 4) 67 CALL_MCOUNT 2 68 mflr r0 /* Load link register LR to r0. */ 69 std r31, -8(r1) /* Save callers register r31. */ 70 std r30, -16(r1) /* Save callers register r30. */ 71 std r29, -24(r1) /* Save callers register r29. */ 72 std r28, -32(r1) /* Save callers register r28. */ 73 std r0, 16(r1) /* Store the link register. */ 74 cfi_offset(r31, -8) 75 cfi_offset(r30, -16) 76 cfi_offset(r28, -32) 77 cfi_offset(r29, -24) 78 cfi_offset(lr, 16) 79 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ 80 cfi_adjust_cfa_offset(FRAMESIZE) 81 82 dcbt 0, r3 83 dcbt 0, r4 84 cmpdi cr7, r3, 0 85 beq cr7, L(retnull) 86 cmpdi cr7, r4, 0 87 beq cr7, L(retnull) 88 89 mr r29, r3 90 mr r30, r4 91 mr r3, r4 92 bl STRLEN 93#ifndef STRLEN_is_local 94 nop 95#endif 96 97 cmpdi cr7, r3, 0 /* If search str is null. */ 98 beq cr7, L(ret_r3) 99 100 mr r31, r3 101 mr r4, r3 102 mr r3, r29 103 bl STRNLEN 104#ifndef STRNLEN_is_local 105 nop 106#endif 107 108 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ 109 blt cr7, L(retnull) 110 mr r3, r29 111 lbz r4, 0(r30) 112 bl STRCHR 113#ifndef STRCHR_is_local 114 nop 115#endif 116 117 mr r11, r3 118 /* If first char of search str is not present. */ 119 cmpdi cr7, r3, 0 120 ble cr7, L(end) 121 /* Reg r28 is used to count the number of iterations. */ 122 li r28, 0 123 rldicl r8, r3, 0, 52 /* Page cross check. */ 124 cmpldi cr7, r8, 4096-16 125 bgt cr7, L(bytebybyte) 126 127 rldicl r8, r30, 0, 52 128 cmpldi cr7, r8, 4096-16 129 bgt cr7, L(bytebybyte) 130 131 /* If len(r4) < 8 handle in a different way. */ 132 /* Shift position based on null and use cmpb. */ 133 cmpdi cr7, r31, 8 134 blt cr7, L(lessthan8) 135 136 /* Len(r4) >= 8 reaches here. */ 137 mr r8, r3 /* Save r3 for future use. */ 138 mr r4, r30 /* Restore r4. */ 139 li r0, 0 140 rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ 141 clrrdi r4, r4, 3 /* Make r4 aligned to 8. */ 142 ld r6, 0(r4) 143 addi r4, r4, 8 144 cmpdi cr7, r10, 0 /* Check if its already aligned? */ 145 beq cr7, L(begin1) 146#ifdef __LITTLE_ENDIAN__ 147 srd r6, r6, r10 /* Discard unwanted bits. */ 148#else 149 sld r6, r6, r10 150#endif 151 ld r9, 0(r4) 152 subfic r10, r10, 64 153#ifdef __LITTLE_ENDIAN__ 154 sld r9, r9, r10 /* Discard unwanted bits. */ 155#else 156 srd r9, r9, r10 157#endif 158 or r6, r6, r9 /* Form complete search str. */ 159L(begin1): 160 mr r29, r6 161 rlwinm r10, r3, 3, 26, 28 162 clrrdi r3, r3, 3 163 ld r5, 0(r3) 164 cmpb r9, r0, r6 /* Check if input has null. */ 165 cmpdi cr7, r9, 0 166 bne cr7, L(return3) 167 cmpb r9, r0, r5 /* Check if input has null. */ 168#ifdef __LITTLE_ENDIAN__ 169 srd r9, r9, r10 170#else 171 sld r9, r9, r10 172#endif 173 cmpdi cr7, r9, 0 174 bne cr7, L(retnull) 175 176 li r12, -8 /* Shift values. */ 177 li r11, 72 /* Shift values. */ 178 cmpdi cr7, r10, 0 179 beq cr7, L(nextbyte1) 180 mr r12, r10 181 addi r12, r12, -8 182 subfic r11, r12, 64 183 184L(nextbyte1): 185 ldu r7, 8(r3) /* Load next dw. */ 186 addi r12, r12, 8 /* Shift one byte and compare. */ 187 addi r11, r11, -8 188#ifdef __LITTLE_ENDIAN__ 189 srd r9, r5, r12 /* Rotate based on mask. */ 190 sld r10, r7, r11 191#else 192 sld r9, r5, r12 193 srd r10, r7, r11 194#endif 195 /* Form single dw from few bytes on first load and second load. */ 196 or r10, r9, r10 197 /* Check for null in the formed dw. */ 198 cmpb r9, r0, r10 199 cmpdi cr7, r9, 0 200 bne cr7, L(retnull) 201 /* Cmpb search str and input str. */ 202 cmpb r9, r10, r6 203 cmpdi cr7, r9, -1 204 beq cr7, L(match) 205 addi r8, r8, 1 206 b L(begin) 207 208 .align 4 209L(match): 210 /* There is a match of 8 bytes, check next bytes. */ 211 cmpdi cr7, r31, 8 212 beq cr7, L(return) 213 /* Update next starting point r8. */ 214 srdi r9, r11, 3 215 subf r9, r9, r3 216 mr r8, r9 217 218L(secondmatch): 219 mr r5, r7 220 rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ 221 ld r6, 0(r4) 222 addi r4, r4, 8 223 cmpdi cr7, r10, 0 /* Check if its already aligned? */ 224 beq cr7, L(proceed3) 225#ifdef __LITTLE_ENDIAN__ 226 srd r6, r6, r10 /* Discard unwanted bits. */ 227 cmpb r9, r0, r6 228 sld r9, r9, r10 229#else 230 sld r6, r6, r10 231 cmpb r9, r0, r6 232 srd r9, r9, r10 233#endif 234 cmpdi cr7, r9, 0 235 bne cr7, L(proceed3) 236 ld r9, 0(r4) 237 subfic r10, r10, 64 238#ifdef __LITTLE_ENDIAN__ 239 sld r9, r9, r10 /* Discard unwanted bits. */ 240#else 241 srd r9, r9, r10 242#endif 243 or r6, r6, r9 /* Form complete search str. */ 244 245L(proceed3): 246 li r7, 0 247 addi r3, r3, 8 248 cmpb r9, r0, r5 249 cmpdi cr7, r9, 0 250 bne cr7, L(proceed4) 251 ld r7, 0(r3) 252L(proceed4): 253#ifdef __LITTLE_ENDIAN__ 254 srd r9, r5, r12 255 sld r10, r7, r11 256#else 257 sld r9, r5, r12 258 srd r10, r7, r11 259#endif 260 /* Form single dw with few bytes from first and second load. */ 261 or r10, r9, r10 262 cmpb r9, r0, r6 263 cmpdi cr7, r9, 0 264 bne cr7, L(return4) 265 /* Check for null in the formed dw. */ 266 cmpb r9, r0, r10 267 cmpdi cr7, r9, 0 268 bne cr7, L(retnull) 269 /* If the next 8 bytes dont match, start search again. */ 270 cmpb r9, r10, r6 271 cmpdi cr7, r9, -1 272 bne cr7, L(reset) 273 /* If the next 8 bytes match, load and compare next 8. */ 274 b L(secondmatch) 275 276 .align 4 277L(reset): 278 /* Start the search again. */ 279 addi r8, r8, 1 280 b L(begin) 281 282 .align 4 283L(return3): 284 /* Count leading zeros and compare partial dw. */ 285#ifdef __LITTLE_ENDIAN__ 286 addi r7, r9, -1 287 andc r7, r7, r9 288 popcntd r7, r7 289 subfic r7, r7, 64 290 sld r10, r5, r7 291 sld r6, r6, r7 292#else 293 cntlzd r7, r9 294 subfic r7, r7, 64 295 srd r10, r5, r7 296 srd r6, r6, r7 297#endif 298 cmpb r9, r10, r6 299 cmpdi cr7, r9, -1 300 addi r8, r8, 1 301 /* Start search again if there is no match. */ 302 bne cr7, L(begin) 303 /* If the words match, update return values. */ 304 subfic r7, r7, 64 305 srdi r7, r7, 3 306 add r3, r3, r7 307 subf r3, r31, r3 308 b L(end) 309 310 .align 4 311L(return4): 312 /* Count leading zeros and compare partial dw. */ 313#ifdef __LITTLE_ENDIAN__ 314 addi r7, r9, -1 315 andc r7, r7, r9 316 popcntd r7, r7 317 subfic r7, r7, 64 318 sld r10, r10, r7 319 sld r6, r6, r7 320#else 321 cntlzd r7, r9 322 subfic r7, r7, 64 323 srd r10, r10, r7 324 srd r6, r6, r7 325#endif 326 cmpb r9, r10, r6 327 cmpdi cr7, r9, -1 328 addi r8, r8, 1 329 bne cr7, L(begin) 330 subfic r7, r7, 64 331 srdi r11, r11, 3 332 subf r3, r11, r3 333 srdi r7, r7, 3 334 add r3, r3, r7 335 subf r3, r31, r3 336 b L(end) 337 338 .align 4 339L(begin): 340 mr r3, r8 341 /* When our iterations exceed ITERATIONS,fall back to default. */ 342 addi r28, r28, 1 343 cmpdi cr7, r28, ITERATIONS 344 beq cr7, L(default) 345 lbz r4, 0(r30) 346 bl STRCHR 347#ifndef STRCHR_is_local 348 nop 349#endif 350 /* If first char of search str is not present. */ 351 cmpdi cr7, r3, 0 352 ble cr7, L(end) 353 mr r8, r3 354 mr r4, r30 /* Restore r4. */ 355 li r0, 0 356 mr r6, r29 357 clrrdi r4, r4, 3 358 addi r4, r4, 8 359 b L(begin1) 360 361 /* Handle less than 8 search string. */ 362 .align 4 363L(lessthan8): 364 mr r4, r3 365 mr r9, r30 366 li r0, 0 367 368 rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */ 369 srdi r8, r10, 3 /* Padding in bytes. */ 370 clrrdi r9, r9, 3 /* Make r4 aligned to 8. */ 371 ld r6, 0(r9) 372 cmpdi cr7, r10, 0 /* Check if its already aligned? */ 373 beq cr7, L(proceed2) 374#ifdef __LITTLE_ENDIAN__ 375 srd r6, r6, r10 /* Discard unwanted bits. */ 376#else 377 sld r6, r6, r10 378#endif 379 subfic r8, r8, 8 380 cmpd cr7, r8, r31 /* Next load needed? */ 381 bge cr7, L(proceed2) 382 ld r7, 8(r9) 383 subfic r10, r10, 64 384#ifdef __LITTLE_ENDIAN__ 385 sld r7, r7, r10 /* Discard unwanted bits. */ 386#else 387 srd r7, r7, r10 388#endif 389 or r6, r6, r7 /* Form complete search str. */ 390L(proceed2): 391 mr r29, r6 392 rlwinm r10, r3, 3, 26, 28 393 clrrdi r7, r3, 3 /* Make r3 aligned. */ 394 ld r5, 0(r7) 395 sldi r8, r31, 3 396 subfic r8, r8, 64 397#ifdef __LITTLE_ENDIAN__ 398 sld r6, r6, r8 399 cmpb r9, r0, r5 400 srd r9, r9, r10 401#else 402 srd r6, r6, r8 403 cmpb r9, r0, r5 404 sld r9, r9, r10 405#endif 406 cmpdi cr7, r9, 0 407 bne cr7, L(noload) 408 cmpdi cr7, r10, 0 409 beq cr7, L(continue) 410 ld r7, 8(r7) 411L(continue1): 412 mr r12, r10 413 addi r12, r12, -8 414 subfic r11, r12, 64 415 b L(nextbyte) 416 417 .align 4 418L(continue): 419 ld r7, 8(r7) 420 li r12, -8 /* Shift values. */ 421 li r11, 72 /* Shift values. */ 422L(nextbyte): 423 addi r12, r12, 8 /* Mask for rotation. */ 424 addi r11, r11, -8 425#ifdef __LITTLE_ENDIAN__ 426 srd r9, r5, r12 427 sld r10, r7, r11 428 or r10, r9, r10 429 sld r10, r10, r8 430 cmpb r9, r0, r10 431 srd r9, r9, r8 432#else 433 sld r9, r5, r12 434 srd r10, r7, r11 435 or r10, r9, r10 436 srd r10, r10, r8 437 cmpb r9, r0, r10 438 sld r9, r9, r8 439#endif 440 cmpdi cr7, r9, 0 441 bne cr7, L(retnull) 442 cmpb r9, r10, r6 443 cmpdi cr7, r9, -1 444 beq cr7, L(end) 445 addi r3, r4, 1 446 /* When our iterations exceed ITERATIONS,fall back to default. */ 447 addi r28, r28, 1 448 cmpdi cr7, r28, ITERATIONS 449 beq cr7, L(default) 450 lbz r4, 0(r30) 451 bl STRCHR 452#ifndef STRCHR_is_local 453 nop 454#endif 455 /* If first char of search str is not present. */ 456 cmpdi cr7, r3, 0 457 ble cr7, L(end) 458 mr r4, r3 459 mr r6, r29 460 li r0, 0 461 b L(proceed2) 462 463 .align 4 464L(noload): 465 /* Reached null in r3, so skip next load. */ 466 li r7, 0 467 b L(continue1) 468 469 .align 4 470L(return): 471 /* Update return values. */ 472 srdi r9, r11, 3 473 subf r3, r9, r3 474 b L(end) 475 476 /* Handling byte by byte. */ 477 .align 4 478L(bytebybyte): 479 mr r8, r3 480 addi r8, r8, -1 481L(loop1): 482 addi r8, r8, 1 483 mr r3, r8 484 mr r4, r30 485 lbz r6, 0(r4) 486 cmpdi cr7, r6, 0 487 beq cr7, L(updater3) 488L(loop): 489 lbz r5, 0(r3) 490 cmpdi cr7, r5, 0 491 beq cr7, L(retnull) 492 cmpld cr7, r6, r5 493 bne cr7, L(loop1) 494 addi r3, r3, 1 495 addi r4, r4, 1 496 lbz r6, 0(r4) 497 cmpdi cr7, r6, 0 498 beq cr7, L(updater3) 499 b L(loop) 500 501 /* Handling return values. */ 502 .align 4 503L(updater3): 504 subf r3, r31, r3 /* Reduce len of r4 from r3. */ 505 b L(end) 506 507 .align 4 508L(ret_r3): 509 mr r3, r29 /* Return r3. */ 510 b L(end) 511 512 .align 4 513L(retnull): 514 li r3, 0 /* Return NULL. */ 515 b L(end) 516 517 .align 4 518L(default): 519 mr r4, r30 520 bl __strstr_ppc 521 nop 522 523 .align 4 524L(end): 525 addi r1, r1, FRAMESIZE /* Restore stack pointer. */ 526 cfi_adjust_cfa_offset(-FRAMESIZE) 527 ld r0, 16(r1) /* Restore the saved link register. */ 528 ld r28, -32(r1) /* Restore callers save register r28. */ 529 ld r29, -24(r1) /* Restore callers save register r29. */ 530 ld r30, -16(r1) /* Restore callers save register r30. */ 531 ld r31, -8(r1) /* Restore callers save register r31. */ 532 mtlr r0 /* Branch to link register. */ 533 blr 534END (STRSTR) 535libc_hidden_builtin_def (strstr) 536