/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power8
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* First string arg.  */
#define rSTR2	r4	/* Second string arg.  */
#define rN	r5	/* Max string length.  */
#define rWORD1	r6	/* Current word in s1.  */
#define rWORD2	r7	/* Current word in s2.  */
#define rWORD3	r8	/* Next word in s1.  */
#define rWORD4	r9	/* Next word in s2.  */
#define rWORD5	r10	/* Next word in s1.  */
#define rWORD6	r11	/* Next word in s2.  */

#define rOFF8	r20	/* 8 bytes offset.  */
#define rOFF16	r21	/* 16 bytes offset.  */
#define rOFF24	r22	/* 24 bytes offset.  */
#define rOFF32	r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rSHL	r29	/* Unaligned shift left count.  */
#define rWORD7	r30	/* Next word in s1.  */
#define rWORD8	r31	/* Next word in s2.  */

#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif

	xor	r10, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 8
	clrldi.	r0, r10, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
	/* If less than 8 bytes or not aligned, use the unaligned
	   byte loop.  */
	blt	cr1, L(bytealigned)
	bne	L(unalignedqw)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.  */

	.align	4
L(samealignment):
	or	r11, rSTR2, rSTR1
	clrldi.	r11, r11, 60
	beq	L(qw_align)
	/* Try to align to QW else proceed to DW loop.  */
	clrldi.	r10, r10, 60
	bne	L(DW)
	/* For the difference to reach QW alignment, load as DW.  */
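	/* Round both pointers down to a DW boundary, then shift out the
	   bytes that precede the true start of the strings so that only
	   the leading (8 - r12) bytes take part in the first compare.  */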
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	subfic	r10, r12, 8
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	sldi	r9, r10, 3
	subfic	r9, r9, 64
	sld	rWORD1, rWORD1, r9
	sld	rWORD2, rWORD2, r9
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(ret_diff)
	subf	rN, r10, rN

	cmpld	cr6, r11, r12
	bgt	cr6, L(qw_align)
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(different)
	cmpldi	cr6, rN, 8
	ble	cr6, L(zeroLength)
	addi	rN, rN, -8
	/* Now both rSTR1 and rSTR2 are aligned to QW.  */
	.align	4
L(qw_align):
	vspltisb	v0, 0
	srdi.	r6, rN, 6
	li	r8, 16
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64)
	mtctr	r6
	vspltisb	v8, 0
	vspltisb	v6, 0
	/* Aligned vector loop.  */
	.align	4
L(aligned_loop):
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r8
	lvx	v8, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	lvx	v4, rSTR1, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r11
	lvx	v8, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(aligned_loop)
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	clrldi	rN, rN, 58
	/* Handle remainder for aligned loop.  */
	.align	4
L(lessthan64):
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v5, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	lvx	v5, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

	/* Calculate and return the difference.  */
	.align	4
L(different1):
	cmpdi	cr6, rN, 16
	bge	cr6, L(different2)
	/* Discard unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v1, 0, rN
	vperm	v4, v4, v0, v1
	vperm	v5, v5, v0, v1
#else
	lvsl	v1, 0, rN
	vperm	v4, v0, v4, v1
	vperm	v5, v0, v5, v1
#endif
	vcmpequb.	v7, v4, v5
	li	rRTN, 0
	bltlr	cr6
	.align	4
L(different2):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	lvsl	v10, r0, r0
	vspltisb	v8, 15
	vsububm	v9, v8, v10
	vperm	v4, v4, v0, v9
	vperm	v5, v5, v0, v9
#endif
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
	vsldoi	v4, v4, v4, 8
	vsldoi	v5, v5, v5, 8
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
L(ret_diff):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(different3):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	vspltisb	v9, 15
	lvsl	v10, r0, r0
	vsububm	v9, v9, v10
	vperm	v6, v6, v0, v9
	vperm	v8, v8, v0, v9
#endif
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
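	/* mfvrd can only read one doubleword of the VR, so rotate each
	   vector by 8 bytes to bring the second doubleword into the
	   readable position.  */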
	vsldoi	v6, v6, v6, 8
	vsldoi	v8, v8, v8, 8
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align	4
L(different):
	cmpldi	cr7, rN, 8
	bgt	cr7, L(end)
	/* Skip unwanted bytes.  */
	sldi	r8, rN, 3
	subfic	r8, r8, 64
	srd	rWORD1, rWORD1, r8
	srd	rWORD2, rWORD2, r8
	cmpld	cr6, rWORD1, rWORD2
	li	rRTN, 0
	beqlr	cr6
L(end):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align	4
L(unalignedqw):
	/* Proceed to the DW unaligned loop if there is a chance of a
	   page cross.  */
	rldicl	r9, rSTR1, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
	rldicl	r9, rSTR2, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
	li	r0, 0
	li	r8, 16
	vspltisb	v0, 0
	/* Check if rSTR1 is aligned to QW.  */
	andi.	r11, rSTR1, 0xF
	beq	L(s1_align)

	/* Compare 16B and align S1 to QW.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	lvsr	v6, 0, rSTR2	/* Compute mask.  */
#else
	lvsl	v10, 0, rSTR1	/* Compute mask.  */
	lvsl	v6, 0, rSTR2	/* Compute mask.  */
#endif
	lvx	v5, 0, rSTR2
	lvx	v9, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
	lvx	v4, 0, rSTR1
	lvx	v9, rSTR1, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	cmpldi	cr6, rN, 16
	ble	cr6, L(zeroLength)
	subfic	r11, r11, 16
	subf	rN, r11, rN
	add	rSTR1, rSTR1, r11
	add	rSTR2, rSTR2, r11

	/* As s1 is QW aligned, prepare for the unaligned loop.  */
	.align	4
L(s1_align):
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
#else
	lvsl	v6, 0, rSTR2
#endif
	lvx	v5, 0, rSTR2
	srdi.	r6, rN, 6
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64_unalign)
	mtctr	r6
	li	r9, 64
	/* Unaligned vector loop.  */
	.align	4
L(unalign_qwloop):
	lvx	v4, 0, rSTR1
	lvx	v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r11
	lvx	v10, rSTR2, r9
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(unalign_qwloop)
	clrldi	rN, rN, 58
	/* Handle remainder for unaligned loop.  */
	.align	4
L(lessthan64_unalign):
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx	v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	addi	r11, r11, 16
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

/* Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
	.align	4
L(DW):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8.  */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16.  */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24.  */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

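/* The DW loops below compare 32 bytes per iteration, staggering the
   loads, compares and branches across cr7, cr1, cr6 and cr5 so that the
   results of one group are tested while the next group is being
   loaded.  */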
/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	srdi	r0, rN, 5	/* Divide by 32.  */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8.  */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16.  */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24.  */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), so we want
   to use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
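/* rN now holds 64 - 8 * (bytes remaining), so shifting both DWs right
   by rN discards the bytes past the compare length before the final
   compare.  */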
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on the compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We speculatively load bytes before the previous bytes have been
   tested.  But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop exits early (before all pending bytes are tested), and
   we must complete the pending compares before returning.  */
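/* L(b1i), L(b2i) and L(b3i) finish the work still in flight when the
   count runs out: two compares already recorded in condition registers
   plus the byte pair that was just loaded.  */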
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word aligned and we
   can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can force
   the string addresses to the next lower DW boundary and special case
   this first DW using shift left to eliminate bits preceding the first
   byte.  Since we want to join the normal (DWaligned) compare loop,
   starting at the second double word, we need to adjust the length (rN)
   and special case the loop versioning for the first DW.  This ensures
   that the loop count is correct and the first DW (shifted) is in the
   expected register pair.  */
L(unaligned):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)
	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.
   Also we must avoid loading a DW where none of the bits are part of
   rSTR2 as this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8.  */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24.  */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32.  */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

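/* Each logical rSTR2 DW is built as (previous physical DW << rSHL) |
   (current physical DW >> rSHR); the shifted-left half of every load is
   kept in a *_SHIFT register so it can be merged with the next one.  */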
/* Remainder is 8.  */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24.  */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)	/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)