1/* memcmp with SSSE3, wmemcmp with SSSE3 2 Copyright (C) 2011-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#if IS_IN (libc) 20 21# include <sysdep.h> 22 23# ifndef MEMCMP 24# define MEMCMP __memcmp_ssse3 25# endif 26 27/* Warning! 28 wmemcmp has to use SIGNED comparison for elements. 29 memcmp has to use UNSIGNED comparison for elements. 30*/ 31 32 atom_text_section 33ENTRY (MEMCMP) /* On entry %rdi and %rsi address the two buffers and %rdx holds the count. In the USE_AS_WMEMCMP build the count is scaled by 4 (shl $2) into a byte length, and a zero length branches to L(equal). */ 34# ifdef USE_AS_WMEMCMP 35 shl $2, %RDX_LP 36 test %RDX_LP, %RDX_LP 37 jz L(equal) 38# elif defined __ILP32__ 39 /* Clear the upper 32 bits. */ 40 mov %edx, %edx 41# endif 42 mov %rdx, %rcx /* RCX = length in bytes from here on. */ 43 mov %rdi, %rdx 44 cmp $48, %rcx; 45 jae L(48bytesormore) /* LEN >= 48 */ 46 /* LEN < 48: advance both pointers to the end of their buffers; the L(less48bytes) handlers read at negative offsets from there. */ 47 add %rcx, %rsi 48 add %rcx, %rdi 49 jmp L(less48bytes) 50 51 .p2align 4 52/* RCX (length) >= 48 on the jae path above.
*/ 53L(48bytesormore): 54 movdqu (%rdi), %xmm3 55 movdqu (%rsi), %xmm0 56 pcmpeqb %xmm0, %xmm3 57 pmovmskb %xmm3, %edx 58 lea 16(%rdi), %rdi 59 lea 16(%rsi), %rsi 60 sub $0xffff, %edx 61 jnz L(less16bytes) 62 mov %edi, %edx 63 and $0xf, %edx 64 xor %rdx, %rdi 65 sub %rdx, %rsi 66 add %rdx, %rcx 67 mov %esi, %edx 68 and $0xf, %edx 69 jz L(shr_0) 70 xor %rdx, %rsi 71 72# ifndef USE_AS_WMEMCMP 73 cmp $8, %edx 74 jae L(next_unaligned_table) 75 cmp $0, %edx 76 je L(shr_0) 77 cmp $1, %edx 78 je L(shr_1) 79 cmp $2, %edx 80 je L(shr_2) 81 cmp $3, %edx 82 je L(shr_3) 83 cmp $4, %edx 84 je L(shr_4) 85 cmp $5, %edx 86 je L(shr_5) 87 cmp $6, %edx 88 je L(shr_6) 89 jmp L(shr_7) 90 91 .p2align 2 92L(next_unaligned_table): 93 cmp $8, %edx 94 je L(shr_8) 95 cmp $9, %edx 96 je L(shr_9) 97 cmp $10, %edx 98 je L(shr_10) 99 cmp $11, %edx 100 je L(shr_11) 101 cmp $12, %edx 102 je L(shr_12) 103 cmp $13, %edx 104 je L(shr_13) 105 cmp $14, %edx 106 je L(shr_14) 107 jmp L(shr_15) 108# else 109 cmp $0, %edx 110 je L(shr_0) 111 cmp $4, %edx 112 je L(shr_4) 113 cmp $8, %edx 114 je L(shr_8) 115 jmp L(shr_12) 116# endif 117 118 .p2align 4 119L(shr_0): 120 cmp $80, %rcx 121 lea -48(%rcx), %rcx 122 jae L(shr_0_gobble) 123 xor %eax, %eax 124 movdqa (%rsi), %xmm1 125 pcmpeqb (%rdi), %xmm1 126 movdqa 16(%rsi), %xmm2 127 pcmpeqb 16(%rdi), %xmm2 128 pand %xmm1, %xmm2 129 pmovmskb %xmm2, %edx 130 lea 32(%rdi), %rdi 131 lea 32(%rsi), %rsi 132 sub $0xffff, %edx 133 jnz L(exit) 134 add %rcx, %rsi 135 add %rcx, %rdi 136 jmp L(less48bytes) 137 138 .p2align 4 139L(shr_0_gobble): 140 movdqa (%rsi), %xmm0 141 xor %eax, %eax 142 pcmpeqb (%rdi), %xmm0 143 sub $32, %rcx 144 movdqa 16(%rsi), %xmm2 145 pcmpeqb 16(%rdi), %xmm2 146L(shr_0_gobble_loop): 147 pand %xmm0, %xmm2 148 sub $32, %rcx 149 pmovmskb %xmm2, %edx 150 movdqa %xmm0, %xmm1 151 movdqa 32(%rsi), %xmm0 152 movdqa 48(%rsi), %xmm2 153 sbb $0xffff, %edx 154 pcmpeqb 32(%rdi), %xmm0 155 pcmpeqb 48(%rdi), %xmm2 156 lea 32(%rdi), %rdi 157 lea 32(%rsi), %rsi 
158 jz L(shr_0_gobble_loop) 159 160 pand %xmm0, %xmm2 161 cmp $0, %rcx 162 jge L(next) 163 inc %edx 164 add $32, %rcx 165L(next): 166 test %edx, %edx 167 jnz L(exit) 168 169 pmovmskb %xmm2, %edx 170 movdqa %xmm0, %xmm1 171 lea 32(%rdi), %rdi 172 lea 32(%rsi), %rsi 173 sub $0xffff, %edx 174 jnz L(exit) 175 add %rcx, %rsi 176 add %rcx, %rdi 177 jmp L(less48bytes) 178 179# ifndef USE_AS_WMEMCMP 180 181 .p2align 4 182L(shr_1): 183 cmp $80, %rcx 184 lea -48(%rcx), %rcx 185 mov %edx, %eax 186 jae L(shr_1_gobble) 187 188 movdqa 16(%rsi), %xmm1 189 movdqa %xmm1, %xmm2 190 palignr $1, (%rsi), %xmm1 191 pcmpeqb (%rdi), %xmm1 192 193 movdqa 32(%rsi), %xmm3 194 palignr $1, %xmm2, %xmm3 195 pcmpeqb 16(%rdi), %xmm3 196 197 pand %xmm1, %xmm3 198 pmovmskb %xmm3, %edx 199 lea 32(%rdi), %rdi 200 lea 32(%rsi), %rsi 201 sub $0xffff, %edx 202 jnz L(exit) 203 add $1, %rsi 204 add %rcx, %rsi 205 add %rcx, %rdi 206 jmp L(less48bytes) 207 208 .p2align 4 209L(shr_1_gobble): 210 sub $32, %rcx 211 movdqa 16(%rsi), %xmm0 212 palignr $1, (%rsi), %xmm0 213 pcmpeqb (%rdi), %xmm0 214 215 movdqa 32(%rsi), %xmm3 216 palignr $1, 16(%rsi), %xmm3 217 pcmpeqb 16(%rdi), %xmm3 218 219L(shr_1_gobble_loop): 220 pand %xmm0, %xmm3 221 sub $32, %rcx 222 pmovmskb %xmm3, %edx 223 movdqa %xmm0, %xmm1 224 225 movdqa 64(%rsi), %xmm3 226 palignr $1, 48(%rsi), %xmm3 227 sbb $0xffff, %edx 228 movdqa 48(%rsi), %xmm0 229 palignr $1, 32(%rsi), %xmm0 230 pcmpeqb 32(%rdi), %xmm0 231 lea 32(%rsi), %rsi 232 pcmpeqb 48(%rdi), %xmm3 233 234 lea 32(%rdi), %rdi 235 jz L(shr_1_gobble_loop) 236 pand %xmm0, %xmm3 237 238 cmp $0, %rcx 239 jge L(shr_1_gobble_next) 240 inc %edx 241 add $32, %rcx 242L(shr_1_gobble_next): 243 test %edx, %edx 244 jnz L(exit) 245 246 pmovmskb %xmm3, %edx 247 movdqa %xmm0, %xmm1 248 lea 32(%rdi), %rdi 249 lea 32(%rsi), %rsi 250 sub $0xffff, %edx 251 jnz L(exit) 252 253 lea 1(%rsi), %rsi 254 add %rcx, %rsi 255 add %rcx, %rdi 256 jmp L(less48bytes) 257 258 259 .p2align 4 260L(shr_2): 261 cmp $80, %rcx 262 
lea -48(%rcx), %rcx 263 mov %edx, %eax 264 jae L(shr_2_gobble) 265 266 movdqa 16(%rsi), %xmm1 267 movdqa %xmm1, %xmm2 268 palignr $2, (%rsi), %xmm1 269 pcmpeqb (%rdi), %xmm1 270 271 movdqa 32(%rsi), %xmm3 272 palignr $2, %xmm2, %xmm3 273 pcmpeqb 16(%rdi), %xmm3 274 275 pand %xmm1, %xmm3 276 pmovmskb %xmm3, %edx 277 lea 32(%rdi), %rdi 278 lea 32(%rsi), %rsi 279 sub $0xffff, %edx 280 jnz L(exit) 281 add $2, %rsi 282 add %rcx, %rsi 283 add %rcx, %rdi 284 jmp L(less48bytes) 285 286 .p2align 4 287L(shr_2_gobble): 288 sub $32, %rcx 289 movdqa 16(%rsi), %xmm0 290 palignr $2, (%rsi), %xmm0 291 pcmpeqb (%rdi), %xmm0 292 293 movdqa 32(%rsi), %xmm3 294 palignr $2, 16(%rsi), %xmm3 295 pcmpeqb 16(%rdi), %xmm3 296 297L(shr_2_gobble_loop): 298 pand %xmm0, %xmm3 299 sub $32, %rcx 300 pmovmskb %xmm3, %edx 301 movdqa %xmm0, %xmm1 302 303 movdqa 64(%rsi), %xmm3 304 palignr $2, 48(%rsi), %xmm3 305 sbb $0xffff, %edx 306 movdqa 48(%rsi), %xmm0 307 palignr $2, 32(%rsi), %xmm0 308 pcmpeqb 32(%rdi), %xmm0 309 lea 32(%rsi), %rsi 310 pcmpeqb 48(%rdi), %xmm3 311 312 lea 32(%rdi), %rdi 313 jz L(shr_2_gobble_loop) 314 pand %xmm0, %xmm3 315 316 cmp $0, %rcx 317 jge L(shr_2_gobble_next) 318 inc %edx 319 add $32, %rcx 320L(shr_2_gobble_next): 321 test %edx, %edx 322 jnz L(exit) 323 324 pmovmskb %xmm3, %edx 325 movdqa %xmm0, %xmm1 326 lea 32(%rdi), %rdi 327 lea 32(%rsi), %rsi 328 sub $0xffff, %edx 329 jnz L(exit) 330 331 lea 2(%rsi), %rsi 332 add %rcx, %rsi 333 add %rcx, %rdi 334 jmp L(less48bytes) 335 336 .p2align 4 337L(shr_3): 338 cmp $80, %rcx 339 lea -48(%rcx), %rcx 340 mov %edx, %eax 341 jae L(shr_3_gobble) 342 343 movdqa 16(%rsi), %xmm1 344 movdqa %xmm1, %xmm2 345 palignr $3, (%rsi), %xmm1 346 pcmpeqb (%rdi), %xmm1 347 348 movdqa 32(%rsi), %xmm3 349 palignr $3, %xmm2, %xmm3 350 pcmpeqb 16(%rdi), %xmm3 351 352 pand %xmm1, %xmm3 353 pmovmskb %xmm3, %edx 354 lea 32(%rdi), %rdi 355 lea 32(%rsi), %rsi 356 sub $0xffff, %edx 357 jnz L(exit) 358 add $3, %rsi 359 add %rcx, %rsi 360 add %rcx, %rdi 361 
jmp L(less48bytes) 362 363 .p2align 4 364L(shr_3_gobble): 365 sub $32, %rcx 366 movdqa 16(%rsi), %xmm0 367 palignr $3, (%rsi), %xmm0 368 pcmpeqb (%rdi), %xmm0 369 370 movdqa 32(%rsi), %xmm3 371 palignr $3, 16(%rsi), %xmm3 372 pcmpeqb 16(%rdi), %xmm3 373 374L(shr_3_gobble_loop): 375 pand %xmm0, %xmm3 376 sub $32, %rcx 377 pmovmskb %xmm3, %edx 378 movdqa %xmm0, %xmm1 379 380 movdqa 64(%rsi), %xmm3 381 palignr $3, 48(%rsi), %xmm3 382 sbb $0xffff, %edx 383 movdqa 48(%rsi), %xmm0 384 palignr $3, 32(%rsi), %xmm0 385 pcmpeqb 32(%rdi), %xmm0 386 lea 32(%rsi), %rsi 387 pcmpeqb 48(%rdi), %xmm3 388 389 lea 32(%rdi), %rdi 390 jz L(shr_3_gobble_loop) 391 pand %xmm0, %xmm3 392 393 cmp $0, %rcx 394 jge L(shr_3_gobble_next) 395 inc %edx 396 add $32, %rcx 397L(shr_3_gobble_next): 398 test %edx, %edx 399 jnz L(exit) 400 401 pmovmskb %xmm3, %edx 402 movdqa %xmm0, %xmm1 403 lea 32(%rdi), %rdi 404 lea 32(%rsi), %rsi 405 sub $0xffff, %edx 406 jnz L(exit) 407 408 lea 3(%rsi), %rsi 409 add %rcx, %rsi 410 add %rcx, %rdi 411 jmp L(less48bytes) 412 413# endif 414 415 .p2align 4 416L(shr_4): 417 cmp $80, %rcx 418 lea -48(%rcx), %rcx 419 mov %edx, %eax 420 jae L(shr_4_gobble) 421 422 movdqa 16(%rsi), %xmm1 423 movdqa %xmm1, %xmm2 424 palignr $4, (%rsi), %xmm1 425 pcmpeqb (%rdi), %xmm1 426 427 movdqa 32(%rsi), %xmm3 428 palignr $4, %xmm2, %xmm3 429 pcmpeqb 16(%rdi), %xmm3 430 431 pand %xmm1, %xmm3 432 pmovmskb %xmm3, %edx 433 lea 32(%rdi), %rdi 434 lea 32(%rsi), %rsi 435 sub $0xffff, %edx 436 jnz L(exit) 437 add $4, %rsi 438 add %rcx, %rsi 439 add %rcx, %rdi 440 jmp L(less48bytes) 441 442 .p2align 4 443L(shr_4_gobble): 444 sub $32, %rcx 445 movdqa 16(%rsi), %xmm0 446 palignr $4, (%rsi), %xmm0 447 pcmpeqb (%rdi), %xmm0 448 449 movdqa 32(%rsi), %xmm3 450 palignr $4, 16(%rsi), %xmm3 451 pcmpeqb 16(%rdi), %xmm3 452 453L(shr_4_gobble_loop): 454 pand %xmm0, %xmm3 455 sub $32, %rcx 456 pmovmskb %xmm3, %edx 457 movdqa %xmm0, %xmm1 458 459 movdqa 64(%rsi), %xmm3 460 palignr $4, 48(%rsi), %xmm3 461 sbb 
$0xffff, %edx 462 movdqa 48(%rsi), %xmm0 463 palignr $4, 32(%rsi), %xmm0 464 pcmpeqb 32(%rdi), %xmm0 465 lea 32(%rsi), %rsi 466 pcmpeqb 48(%rdi), %xmm3 467 468 lea 32(%rdi), %rdi 469 jz L(shr_4_gobble_loop) 470 pand %xmm0, %xmm3 471 472 cmp $0, %rcx 473 jge L(shr_4_gobble_next) 474 inc %edx 475 add $32, %rcx 476L(shr_4_gobble_next): 477 test %edx, %edx 478 jnz L(exit) 479 480 pmovmskb %xmm3, %edx 481 movdqa %xmm0, %xmm1 482 lea 32(%rdi), %rdi 483 lea 32(%rsi), %rsi 484 sub $0xffff, %edx 485 jnz L(exit) 486 487 lea 4(%rsi), %rsi 488 add %rcx, %rsi 489 add %rcx, %rdi 490 jmp L(less48bytes) 491 492# ifndef USE_AS_WMEMCMP 493 494 .p2align 4 495L(shr_5): 496 cmp $80, %rcx 497 lea -48(%rcx), %rcx 498 mov %edx, %eax 499 jae L(shr_5_gobble) 500 501 movdqa 16(%rsi), %xmm1 502 movdqa %xmm1, %xmm2 503 palignr $5, (%rsi), %xmm1 504 pcmpeqb (%rdi), %xmm1 505 506 movdqa 32(%rsi), %xmm3 507 palignr $5, %xmm2, %xmm3 508 pcmpeqb 16(%rdi), %xmm3 509 510 pand %xmm1, %xmm3 511 pmovmskb %xmm3, %edx 512 lea 32(%rdi), %rdi 513 lea 32(%rsi), %rsi 514 sub $0xffff, %edx 515 jnz L(exit) 516 add $5, %rsi 517 add %rcx, %rsi 518 add %rcx, %rdi 519 jmp L(less48bytes) 520 521 .p2align 4 522L(shr_5_gobble): 523 sub $32, %rcx 524 movdqa 16(%rsi), %xmm0 525 palignr $5, (%rsi), %xmm0 526 pcmpeqb (%rdi), %xmm0 527 528 movdqa 32(%rsi), %xmm3 529 palignr $5, 16(%rsi), %xmm3 530 pcmpeqb 16(%rdi), %xmm3 531 532L(shr_5_gobble_loop): 533 pand %xmm0, %xmm3 534 sub $32, %rcx 535 pmovmskb %xmm3, %edx 536 movdqa %xmm0, %xmm1 537 538 movdqa 64(%rsi), %xmm3 539 palignr $5, 48(%rsi), %xmm3 540 sbb $0xffff, %edx 541 movdqa 48(%rsi), %xmm0 542 palignr $5, 32(%rsi), %xmm0 543 pcmpeqb 32(%rdi), %xmm0 544 lea 32(%rsi), %rsi 545 pcmpeqb 48(%rdi), %xmm3 546 547 lea 32(%rdi), %rdi 548 jz L(shr_5_gobble_loop) 549 pand %xmm0, %xmm3 550 551 cmp $0, %rcx 552 jge L(shr_5_gobble_next) 553 inc %edx 554 add $32, %rcx 555L(shr_5_gobble_next): 556 test %edx, %edx 557 jnz L(exit) 558 559 pmovmskb %xmm3, %edx 560 movdqa %xmm0, %xmm1 
561 lea 32(%rdi), %rdi 562 lea 32(%rsi), %rsi 563 sub $0xffff, %edx 564 jnz L(exit) 565 566 lea 5(%rsi), %rsi 567 add %rcx, %rsi 568 add %rcx, %rdi 569 jmp L(less48bytes) 570 571 .p2align 4 572L(shr_6): 573 cmp $80, %rcx 574 lea -48(%rcx), %rcx 575 mov %edx, %eax 576 jae L(shr_6_gobble) 577 578 movdqa 16(%rsi), %xmm1 579 movdqa %xmm1, %xmm2 580 palignr $6, (%rsi), %xmm1 581 pcmpeqb (%rdi), %xmm1 582 583 movdqa 32(%rsi), %xmm3 584 palignr $6, %xmm2, %xmm3 585 pcmpeqb 16(%rdi), %xmm3 586 587 pand %xmm1, %xmm3 588 pmovmskb %xmm3, %edx 589 lea 32(%rdi), %rdi 590 lea 32(%rsi), %rsi 591 sub $0xffff, %edx 592 jnz L(exit) 593 add $6, %rsi 594 add %rcx, %rsi 595 add %rcx, %rdi 596 jmp L(less48bytes) 597 598 .p2align 4 599L(shr_6_gobble): 600 sub $32, %rcx 601 movdqa 16(%rsi), %xmm0 602 palignr $6, (%rsi), %xmm0 603 pcmpeqb (%rdi), %xmm0 604 605 movdqa 32(%rsi), %xmm3 606 palignr $6, 16(%rsi), %xmm3 607 pcmpeqb 16(%rdi), %xmm3 608 609L(shr_6_gobble_loop): 610 pand %xmm0, %xmm3 611 sub $32, %rcx 612 pmovmskb %xmm3, %edx 613 movdqa %xmm0, %xmm1 614 615 movdqa 64(%rsi), %xmm3 616 palignr $6, 48(%rsi), %xmm3 617 sbb $0xffff, %edx 618 movdqa 48(%rsi), %xmm0 619 palignr $6, 32(%rsi), %xmm0 620 pcmpeqb 32(%rdi), %xmm0 621 lea 32(%rsi), %rsi 622 pcmpeqb 48(%rdi), %xmm3 623 624 lea 32(%rdi), %rdi 625 jz L(shr_6_gobble_loop) 626 pand %xmm0, %xmm3 627 628 cmp $0, %rcx 629 jge L(shr_6_gobble_next) 630 inc %edx 631 add $32, %rcx 632L(shr_6_gobble_next): 633 test %edx, %edx 634 jnz L(exit) 635 636 pmovmskb %xmm3, %edx 637 movdqa %xmm0, %xmm1 638 lea 32(%rdi), %rdi 639 lea 32(%rsi), %rsi 640 sub $0xffff, %edx 641 jnz L(exit) 642 643 lea 6(%rsi), %rsi 644 add %rcx, %rsi 645 add %rcx, %rdi 646 jmp L(less48bytes) 647 648 .p2align 4 649L(shr_7): 650 cmp $80, %rcx 651 lea -48(%rcx), %rcx 652 mov %edx, %eax 653 jae L(shr_7_gobble) 654 655 movdqa 16(%rsi), %xmm1 656 movdqa %xmm1, %xmm2 657 palignr $7, (%rsi), %xmm1 658 pcmpeqb (%rdi), %xmm1 659 660 movdqa 32(%rsi), %xmm3 661 palignr $7, %xmm2, 
%xmm3 662 pcmpeqb 16(%rdi), %xmm3 663 664 pand %xmm1, %xmm3 665 pmovmskb %xmm3, %edx 666 lea 32(%rdi), %rdi 667 lea 32(%rsi), %rsi 668 sub $0xffff, %edx 669 jnz L(exit) 670 add $7, %rsi 671 add %rcx, %rsi 672 add %rcx, %rdi 673 jmp L(less48bytes) 674 675 .p2align 4 676L(shr_7_gobble): 677 sub $32, %rcx 678 movdqa 16(%rsi), %xmm0 679 palignr $7, (%rsi), %xmm0 680 pcmpeqb (%rdi), %xmm0 681 682 movdqa 32(%rsi), %xmm3 683 palignr $7, 16(%rsi), %xmm3 684 pcmpeqb 16(%rdi), %xmm3 685 686L(shr_7_gobble_loop): 687 pand %xmm0, %xmm3 688 sub $32, %rcx 689 pmovmskb %xmm3, %edx 690 movdqa %xmm0, %xmm1 691 692 movdqa 64(%rsi), %xmm3 693 palignr $7, 48(%rsi), %xmm3 694 sbb $0xffff, %edx 695 movdqa 48(%rsi), %xmm0 696 palignr $7, 32(%rsi), %xmm0 697 pcmpeqb 32(%rdi), %xmm0 698 lea 32(%rsi), %rsi 699 pcmpeqb 48(%rdi), %xmm3 700 701 lea 32(%rdi), %rdi 702 jz L(shr_7_gobble_loop) 703 pand %xmm0, %xmm3 704 705 cmp $0, %rcx 706 jge L(shr_7_gobble_next) 707 inc %edx 708 add $32, %rcx 709L(shr_7_gobble_next): 710 test %edx, %edx 711 jnz L(exit) 712 713 pmovmskb %xmm3, %edx 714 movdqa %xmm0, %xmm1 715 lea 32(%rdi), %rdi 716 lea 32(%rsi), %rsi 717 sub $0xffff, %edx 718 jnz L(exit) 719 720 lea 7(%rsi), %rsi 721 add %rcx, %rsi 722 add %rcx, %rdi 723 jmp L(less48bytes) 724 725# endif 726 727 .p2align 4 728L(shr_8): 729 cmp $80, %rcx 730 lea -48(%rcx), %rcx 731 mov %edx, %eax 732 jae L(shr_8_gobble) 733 734 movdqa 16(%rsi), %xmm1 735 movdqa %xmm1, %xmm2 736 palignr $8, (%rsi), %xmm1 737 pcmpeqb (%rdi), %xmm1 738 739 movdqa 32(%rsi), %xmm3 740 palignr $8, %xmm2, %xmm3 741 pcmpeqb 16(%rdi), %xmm3 742 743 pand %xmm1, %xmm3 744 pmovmskb %xmm3, %edx 745 lea 32(%rdi), %rdi 746 lea 32(%rsi), %rsi 747 sub $0xffff, %edx 748 jnz L(exit) 749 add $8, %rsi 750 add %rcx, %rsi 751 add %rcx, %rdi 752 jmp L(less48bytes) 753 754 .p2align 4 755L(shr_8_gobble): 756 sub $32, %rcx 757 movdqa 16(%rsi), %xmm0 758 palignr $8, (%rsi), %xmm0 759 pcmpeqb (%rdi), %xmm0 760 761 movdqa 32(%rsi), %xmm3 762 palignr $8, 
16(%rsi), %xmm3 763 pcmpeqb 16(%rdi), %xmm3 764 765L(shr_8_gobble_loop): 766 pand %xmm0, %xmm3 767 sub $32, %rcx 768 pmovmskb %xmm3, %edx 769 movdqa %xmm0, %xmm1 770 771 movdqa 64(%rsi), %xmm3 772 palignr $8, 48(%rsi), %xmm3 773 sbb $0xffff, %edx 774 movdqa 48(%rsi), %xmm0 775 palignr $8, 32(%rsi), %xmm0 776 pcmpeqb 32(%rdi), %xmm0 777 lea 32(%rsi), %rsi 778 pcmpeqb 48(%rdi), %xmm3 779 780 lea 32(%rdi), %rdi 781 jz L(shr_8_gobble_loop) 782 pand %xmm0, %xmm3 783 784 cmp $0, %rcx 785 jge L(shr_8_gobble_next) 786 inc %edx 787 add $32, %rcx 788L(shr_8_gobble_next): 789 test %edx, %edx 790 jnz L(exit) 791 792 pmovmskb %xmm3, %edx 793 movdqa %xmm0, %xmm1 794 lea 32(%rdi), %rdi 795 lea 32(%rsi), %rsi 796 sub $0xffff, %edx 797 jnz L(exit) 798 799 lea 8(%rsi), %rsi 800 add %rcx, %rsi 801 add %rcx, %rdi 802 jmp L(less48bytes) 803 804# ifndef USE_AS_WMEMCMP 805 806 .p2align 4 807L(shr_9): 808 cmp $80, %rcx 809 lea -48(%rcx), %rcx 810 mov %edx, %eax 811 jae L(shr_9_gobble) 812 813 movdqa 16(%rsi), %xmm1 814 movdqa %xmm1, %xmm2 815 palignr $9, (%rsi), %xmm1 816 pcmpeqb (%rdi), %xmm1 817 818 movdqa 32(%rsi), %xmm3 819 palignr $9, %xmm2, %xmm3 820 pcmpeqb 16(%rdi), %xmm3 821 822 pand %xmm1, %xmm3 823 pmovmskb %xmm3, %edx 824 lea 32(%rdi), %rdi 825 lea 32(%rsi), %rsi 826 sub $0xffff, %edx 827 jnz L(exit) 828 add $9, %rsi 829 add %rcx, %rsi 830 add %rcx, %rdi 831 jmp L(less48bytes) 832 833 .p2align 4 834L(shr_9_gobble): 835 sub $32, %rcx 836 movdqa 16(%rsi), %xmm0 837 palignr $9, (%rsi), %xmm0 838 pcmpeqb (%rdi), %xmm0 839 840 movdqa 32(%rsi), %xmm3 841 palignr $9, 16(%rsi), %xmm3 842 pcmpeqb 16(%rdi), %xmm3 843 844L(shr_9_gobble_loop): 845 pand %xmm0, %xmm3 846 sub $32, %rcx 847 pmovmskb %xmm3, %edx 848 movdqa %xmm0, %xmm1 849 850 movdqa 64(%rsi), %xmm3 851 palignr $9, 48(%rsi), %xmm3 852 sbb $0xffff, %edx 853 movdqa 48(%rsi), %xmm0 854 palignr $9, 32(%rsi), %xmm0 855 pcmpeqb 32(%rdi), %xmm0 856 lea 32(%rsi), %rsi 857 pcmpeqb 48(%rdi), %xmm3 858 859 lea 32(%rdi), %rdi 860 jz 
L(shr_9_gobble_loop) 861 pand %xmm0, %xmm3 862 863 cmp $0, %rcx 864 jge L(shr_9_gobble_next) 865 inc %edx 866 add $32, %rcx 867L(shr_9_gobble_next): 868 test %edx, %edx 869 jnz L(exit) 870 871 pmovmskb %xmm3, %edx 872 movdqa %xmm0, %xmm1 873 lea 32(%rdi), %rdi 874 lea 32(%rsi), %rsi 875 sub $0xffff, %edx 876 jnz L(exit) 877 878 lea 9(%rsi), %rsi 879 add %rcx, %rsi 880 add %rcx, %rdi 881 jmp L(less48bytes) 882 883 .p2align 4 884L(shr_10): 885 cmp $80, %rcx 886 lea -48(%rcx), %rcx 887 mov %edx, %eax 888 jae L(shr_10_gobble) 889 890 movdqa 16(%rsi), %xmm1 891 movdqa %xmm1, %xmm2 892 palignr $10, (%rsi), %xmm1 893 pcmpeqb (%rdi), %xmm1 894 895 movdqa 32(%rsi), %xmm3 896 palignr $10, %xmm2, %xmm3 897 pcmpeqb 16(%rdi), %xmm3 898 899 pand %xmm1, %xmm3 900 pmovmskb %xmm3, %edx 901 lea 32(%rdi), %rdi 902 lea 32(%rsi), %rsi 903 sub $0xffff, %edx 904 jnz L(exit) 905 add $10, %rsi 906 add %rcx, %rsi 907 add %rcx, %rdi 908 jmp L(less48bytes) 909 910 .p2align 4 911L(shr_10_gobble): 912 sub $32, %rcx 913 movdqa 16(%rsi), %xmm0 914 palignr $10, (%rsi), %xmm0 915 pcmpeqb (%rdi), %xmm0 916 917 movdqa 32(%rsi), %xmm3 918 palignr $10, 16(%rsi), %xmm3 919 pcmpeqb 16(%rdi), %xmm3 920 921L(shr_10_gobble_loop): 922 pand %xmm0, %xmm3 923 sub $32, %rcx 924 pmovmskb %xmm3, %edx 925 movdqa %xmm0, %xmm1 926 927 movdqa 64(%rsi), %xmm3 928 palignr $10, 48(%rsi), %xmm3 929 sbb $0xffff, %edx 930 movdqa 48(%rsi), %xmm0 931 palignr $10, 32(%rsi), %xmm0 932 pcmpeqb 32(%rdi), %xmm0 933 lea 32(%rsi), %rsi 934 pcmpeqb 48(%rdi), %xmm3 935 936 lea 32(%rdi), %rdi 937 jz L(shr_10_gobble_loop) 938 pand %xmm0, %xmm3 939 940 cmp $0, %rcx 941 jge L(shr_10_gobble_next) 942 inc %edx 943 add $32, %rcx 944L(shr_10_gobble_next): 945 test %edx, %edx 946 jnz L(exit) 947 948 pmovmskb %xmm3, %edx 949 movdqa %xmm0, %xmm1 950 lea 32(%rdi), %rdi 951 lea 32(%rsi), %rsi 952 sub $0xffff, %edx 953 jnz L(exit) 954 955 lea 10(%rsi), %rsi 956 add %rcx, %rsi 957 add %rcx, %rdi 958 jmp L(less48bytes) 959 960 .p2align 4 
961L(shr_11): 962 cmp $80, %rcx 963 lea -48(%rcx), %rcx 964 mov %edx, %eax 965 jae L(shr_11_gobble) 966 967 movdqa 16(%rsi), %xmm1 968 movdqa %xmm1, %xmm2 969 palignr $11, (%rsi), %xmm1 970 pcmpeqb (%rdi), %xmm1 971 972 movdqa 32(%rsi), %xmm3 973 palignr $11, %xmm2, %xmm3 974 pcmpeqb 16(%rdi), %xmm3 975 976 pand %xmm1, %xmm3 977 pmovmskb %xmm3, %edx 978 lea 32(%rdi), %rdi 979 lea 32(%rsi), %rsi 980 sub $0xffff, %edx 981 jnz L(exit) 982 add $11, %rsi 983 add %rcx, %rsi 984 add %rcx, %rdi 985 jmp L(less48bytes) 986 987 .p2align 4 988L(shr_11_gobble): 989 sub $32, %rcx 990 movdqa 16(%rsi), %xmm0 991 palignr $11, (%rsi), %xmm0 992 pcmpeqb (%rdi), %xmm0 993 994 movdqa 32(%rsi), %xmm3 995 palignr $11, 16(%rsi), %xmm3 996 pcmpeqb 16(%rdi), %xmm3 997 998L(shr_11_gobble_loop): 999 pand %xmm0, %xmm3 1000 sub $32, %rcx 1001 pmovmskb %xmm3, %edx 1002 movdqa %xmm0, %xmm1 1003 1004 movdqa 64(%rsi), %xmm3 1005 palignr $11, 48(%rsi), %xmm3 1006 sbb $0xffff, %edx 1007 movdqa 48(%rsi), %xmm0 1008 palignr $11, 32(%rsi), %xmm0 1009 pcmpeqb 32(%rdi), %xmm0 1010 lea 32(%rsi), %rsi 1011 pcmpeqb 48(%rdi), %xmm3 1012 1013 lea 32(%rdi), %rdi 1014 jz L(shr_11_gobble_loop) 1015 pand %xmm0, %xmm3 1016 1017 cmp $0, %rcx 1018 jge L(shr_11_gobble_next) 1019 inc %edx 1020 add $32, %rcx 1021L(shr_11_gobble_next): 1022 test %edx, %edx 1023 jnz L(exit) 1024 1025 pmovmskb %xmm3, %edx 1026 movdqa %xmm0, %xmm1 1027 lea 32(%rdi), %rdi 1028 lea 32(%rsi), %rsi 1029 sub $0xffff, %edx 1030 jnz L(exit) 1031 1032 lea 11(%rsi), %rsi 1033 add %rcx, %rsi 1034 add %rcx, %rdi 1035 jmp L(less48bytes) 1036 1037# endif 1038 1039 .p2align 4 1040L(shr_12): 1041 cmp $80, %rcx 1042 lea -48(%rcx), %rcx 1043 mov %edx, %eax 1044 jae L(shr_12_gobble) 1045 1046 movdqa 16(%rsi), %xmm1 1047 movdqa %xmm1, %xmm2 1048 palignr $12, (%rsi), %xmm1 1049 pcmpeqb (%rdi), %xmm1 1050 1051 movdqa 32(%rsi), %xmm3 1052 palignr $12, %xmm2, %xmm3 1053 pcmpeqb 16(%rdi), %xmm3 1054 1055 pand %xmm1, %xmm3 1056 pmovmskb %xmm3, %edx 1057 lea 
32(%rdi), %rdi 1058 lea 32(%rsi), %rsi 1059 sub $0xffff, %edx 1060 jnz L(exit) 1061 add $12, %rsi 1062 add %rcx, %rsi 1063 add %rcx, %rdi 1064 jmp L(less48bytes) 1065 1066 .p2align 4 1067L(shr_12_gobble): 1068 sub $32, %rcx 1069 movdqa 16(%rsi), %xmm0 1070 palignr $12, (%rsi), %xmm0 1071 pcmpeqb (%rdi), %xmm0 1072 1073 movdqa 32(%rsi), %xmm3 1074 palignr $12, 16(%rsi), %xmm3 1075 pcmpeqb 16(%rdi), %xmm3 1076 1077L(shr_12_gobble_loop): 1078 pand %xmm0, %xmm3 1079 sub $32, %rcx 1080 pmovmskb %xmm3, %edx 1081 movdqa %xmm0, %xmm1 1082 1083 movdqa 64(%rsi), %xmm3 1084 palignr $12, 48(%rsi), %xmm3 1085 sbb $0xffff, %edx 1086 movdqa 48(%rsi), %xmm0 1087 palignr $12, 32(%rsi), %xmm0 1088 pcmpeqb 32(%rdi), %xmm0 1089 lea 32(%rsi), %rsi 1090 pcmpeqb 48(%rdi), %xmm3 1091 1092 lea 32(%rdi), %rdi 1093 jz L(shr_12_gobble_loop) 1094 pand %xmm0, %xmm3 1095 1096 cmp $0, %rcx 1097 jge L(shr_12_gobble_next) 1098 inc %edx 1099 add $32, %rcx 1100L(shr_12_gobble_next): 1101 test %edx, %edx 1102 jnz L(exit) 1103 1104 pmovmskb %xmm3, %edx 1105 movdqa %xmm0, %xmm1 1106 lea 32(%rdi), %rdi 1107 lea 32(%rsi), %rsi 1108 sub $0xffff, %edx 1109 jnz L(exit) 1110 1111 lea 12(%rsi), %rsi 1112 add %rcx, %rsi 1113 add %rcx, %rdi 1114 jmp L(less48bytes) 1115 1116# ifndef USE_AS_WMEMCMP 1117 1118 .p2align 4 1119L(shr_13): 1120 cmp $80, %rcx 1121 lea -48(%rcx), %rcx 1122 mov %edx, %eax 1123 jae L(shr_13_gobble) 1124 1125 movdqa 16(%rsi), %xmm1 1126 movdqa %xmm1, %xmm2 1127 palignr $13, (%rsi), %xmm1 1128 pcmpeqb (%rdi), %xmm1 1129 1130 movdqa 32(%rsi), %xmm3 1131 palignr $13, %xmm2, %xmm3 1132 pcmpeqb 16(%rdi), %xmm3 1133 1134 pand %xmm1, %xmm3 1135 pmovmskb %xmm3, %edx 1136 lea 32(%rdi), %rdi 1137 lea 32(%rsi), %rsi 1138 sub $0xffff, %edx 1139 jnz L(exit) 1140 add $13, %rsi 1141 add %rcx, %rsi 1142 add %rcx, %rdi 1143 jmp L(less48bytes) 1144 1145 .p2align 4 1146L(shr_13_gobble): 1147 sub $32, %rcx 1148 movdqa 16(%rsi), %xmm0 1149 palignr $13, (%rsi), %xmm0 1150 pcmpeqb (%rdi), %xmm0 1151 1152 movdqa 
32(%rsi), %xmm3 1153 palignr $13, 16(%rsi), %xmm3 1154 pcmpeqb 16(%rdi), %xmm3 1155 1156L(shr_13_gobble_loop): 1157 pand %xmm0, %xmm3 1158 sub $32, %rcx 1159 pmovmskb %xmm3, %edx 1160 movdqa %xmm0, %xmm1 1161 1162 movdqa 64(%rsi), %xmm3 1163 palignr $13, 48(%rsi), %xmm3 1164 sbb $0xffff, %edx 1165 movdqa 48(%rsi), %xmm0 1166 palignr $13, 32(%rsi), %xmm0 1167 pcmpeqb 32(%rdi), %xmm0 1168 lea 32(%rsi), %rsi 1169 pcmpeqb 48(%rdi), %xmm3 1170 1171 lea 32(%rdi), %rdi 1172 jz L(shr_13_gobble_loop) 1173 pand %xmm0, %xmm3 1174 1175 cmp $0, %rcx 1176 jge L(shr_13_gobble_next) 1177 inc %edx 1178 add $32, %rcx 1179L(shr_13_gobble_next): 1180 test %edx, %edx 1181 jnz L(exit) 1182 1183 pmovmskb %xmm3, %edx 1184 movdqa %xmm0, %xmm1 1185 lea 32(%rdi), %rdi 1186 lea 32(%rsi), %rsi 1187 sub $0xffff, %edx 1188 jnz L(exit) 1189 1190 lea 13(%rsi), %rsi 1191 add %rcx, %rsi 1192 add %rcx, %rdi 1193 jmp L(less48bytes) 1194 1195 .p2align 4 1196L(shr_14): 1197 cmp $80, %rcx 1198 lea -48(%rcx), %rcx 1199 mov %edx, %eax 1200 jae L(shr_14_gobble) 1201 1202 movdqa 16(%rsi), %xmm1 1203 movdqa %xmm1, %xmm2 1204 palignr $14, (%rsi), %xmm1 1205 pcmpeqb (%rdi), %xmm1 1206 1207 movdqa 32(%rsi), %xmm3 1208 palignr $14, %xmm2, %xmm3 1209 pcmpeqb 16(%rdi), %xmm3 1210 1211 pand %xmm1, %xmm3 1212 pmovmskb %xmm3, %edx 1213 lea 32(%rdi), %rdi 1214 lea 32(%rsi), %rsi 1215 sub $0xffff, %edx 1216 jnz L(exit) 1217 add $14, %rsi 1218 add %rcx, %rsi 1219 add %rcx, %rdi 1220 jmp L(less48bytes) 1221 1222 .p2align 4 1223L(shr_14_gobble): 1224 sub $32, %rcx 1225 movdqa 16(%rsi), %xmm0 1226 palignr $14, (%rsi), %xmm0 1227 pcmpeqb (%rdi), %xmm0 1228 1229 movdqa 32(%rsi), %xmm3 1230 palignr $14, 16(%rsi), %xmm3 1231 pcmpeqb 16(%rdi), %xmm3 1232 1233L(shr_14_gobble_loop): 1234 pand %xmm0, %xmm3 1235 sub $32, %rcx 1236 pmovmskb %xmm3, %edx 1237 movdqa %xmm0, %xmm1 1238 1239 movdqa 64(%rsi), %xmm3 1240 palignr $14, 48(%rsi), %xmm3 1241 sbb $0xffff, %edx 1242 movdqa 48(%rsi), %xmm0 1243 palignr $14, 32(%rsi), %xmm0 1244 
pcmpeqb 32(%rdi), %xmm0 1245 lea 32(%rsi), %rsi 1246 pcmpeqb 48(%rdi), %xmm3 1247 1248 lea 32(%rdi), %rdi 1249 jz L(shr_14_gobble_loop) 1250 pand %xmm0, %xmm3 1251 1252 cmp $0, %rcx 1253 jge L(shr_14_gobble_next) 1254 inc %edx 1255 add $32, %rcx 1256L(shr_14_gobble_next): 1257 test %edx, %edx 1258 jnz L(exit) 1259 1260 pmovmskb %xmm3, %edx 1261 movdqa %xmm0, %xmm1 1262 lea 32(%rdi), %rdi 1263 lea 32(%rsi), %rsi 1264 sub $0xffff, %edx 1265 jnz L(exit) 1266 1267 lea 14(%rsi), %rsi 1268 add %rcx, %rsi 1269 add %rcx, %rdi 1270 jmp L(less48bytes) 1271 1272 .p2align 4 1273L(shr_15): 1274 cmp $80, %rcx 1275 lea -48(%rcx), %rcx 1276 mov %edx, %eax 1277 jae L(shr_15_gobble) 1278 1279 movdqa 16(%rsi), %xmm1 1280 movdqa %xmm1, %xmm2 1281 palignr $15, (%rsi), %xmm1 1282 pcmpeqb (%rdi), %xmm1 1283 1284 movdqa 32(%rsi), %xmm3 1285 palignr $15, %xmm2, %xmm3 1286 pcmpeqb 16(%rdi), %xmm3 1287 1288 pand %xmm1, %xmm3 1289 pmovmskb %xmm3, %edx 1290 lea 32(%rdi), %rdi 1291 lea 32(%rsi), %rsi 1292 sub $0xffff, %edx 1293 jnz L(exit) 1294 add $15, %rsi 1295 add %rcx, %rsi 1296 add %rcx, %rdi 1297 jmp L(less48bytes) 1298 1299 .p2align 4 1300L(shr_15_gobble): 1301 sub $32, %rcx 1302 movdqa 16(%rsi), %xmm0 1303 palignr $15, (%rsi), %xmm0 1304 pcmpeqb (%rdi), %xmm0 1305 1306 movdqa 32(%rsi), %xmm3 1307 palignr $15, 16(%rsi), %xmm3 1308 pcmpeqb 16(%rdi), %xmm3 1309 1310L(shr_15_gobble_loop): 1311 pand %xmm0, %xmm3 1312 sub $32, %rcx 1313 pmovmskb %xmm3, %edx 1314 movdqa %xmm0, %xmm1 1315 1316 movdqa 64(%rsi), %xmm3 1317 palignr $15, 48(%rsi), %xmm3 1318 sbb $0xffff, %edx 1319 movdqa 48(%rsi), %xmm0 1320 palignr $15, 32(%rsi), %xmm0 1321 pcmpeqb 32(%rdi), %xmm0 1322 lea 32(%rsi), %rsi 1323 pcmpeqb 48(%rdi), %xmm3 1324 1325 lea 32(%rdi), %rdi 1326 jz L(shr_15_gobble_loop) 1327 pand %xmm0, %xmm3 1328 1329 cmp $0, %rcx 1330 jge L(shr_15_gobble_next) 1331 inc %edx 1332 add $32, %rcx 1333L(shr_15_gobble_next): 1334 test %edx, %edx 1335 jnz L(exit) 1336 1337 pmovmskb %xmm3, %edx 1338 movdqa %xmm0, 
%xmm1 1339 lea 32(%rdi), %rdi 1340 lea 32(%rsi), %rsi 1341 sub $0xffff, %edx 1342 jnz L(exit) 1343 1344 lea 15(%rsi), %rsi 1345 add %rcx, %rsi 1346 add %rcx, %rdi 1347 jmp L(less48bytes) 1348# endif 1349 .p2align 4 1350L(exit): 1351 pmovmskb %xmm1, %r8d 1352 sub $0xffff, %r8d 1353 jz L(first16bytes) 1354 lea -16(%rsi), %rsi 1355 lea -16(%rdi), %rdi 1356 mov %r8d, %edx 1357L(first16bytes): 1358 add %rax, %rsi 1359L(less16bytes): 1360# ifndef USE_AS_WMEMCMP 1361 test %dl, %dl 1362 jz L(next_24_bytes) 1363 1364 test $0x01, %dl 1365 jnz L(Byte16) 1366 1367 test $0x02, %dl 1368 jnz L(Byte17) 1369 1370 test $0x04, %dl 1371 jnz L(Byte18) 1372 1373 test $0x08, %dl 1374 jnz L(Byte19) 1375 1376 test $0x10, %dl 1377 jnz L(Byte20) 1378 1379 test $0x20, %dl 1380 jnz L(Byte21) 1381 1382 test $0x40, %dl 1383 jnz L(Byte22) 1384 1385 movzbl -9(%rdi), %eax 1386 movzbl -9(%rsi), %edx 1387 sub %edx, %eax 1388 ret 1389 1390 .p2align 4 1391L(Byte16): 1392 movzbl -16(%rdi), %eax 1393 movzbl -16(%rsi), %edx 1394 sub %edx, %eax 1395 ret 1396 1397 .p2align 4 1398L(Byte17): 1399 movzbl -15(%rdi), %eax 1400 movzbl -15(%rsi), %edx 1401 sub %edx, %eax 1402 ret 1403 1404 .p2align 4 1405L(Byte18): 1406 movzbl -14(%rdi), %eax 1407 movzbl -14(%rsi), %edx 1408 sub %edx, %eax 1409 ret 1410 1411 .p2align 4 1412L(Byte19): 1413 movzbl -13(%rdi), %eax 1414 movzbl -13(%rsi), %edx 1415 sub %edx, %eax 1416 ret 1417 1418 .p2align 4 1419L(Byte20): 1420 movzbl -12(%rdi), %eax 1421 movzbl -12(%rsi), %edx 1422 sub %edx, %eax 1423 ret 1424 1425 .p2align 4 1426L(Byte21): 1427 movzbl -11(%rdi), %eax 1428 movzbl -11(%rsi), %edx 1429 sub %edx, %eax 1430 ret 1431 1432 .p2align 4 1433L(Byte22): 1434 movzbl -10(%rdi), %eax 1435 movzbl -10(%rsi), %edx 1436 sub %edx, %eax 1437 ret 1438 1439 .p2align 4 1440L(next_24_bytes): 1441 lea 8(%rdi), %rdi 1442 lea 8(%rsi), %rsi 1443 test $0x01, %dh 1444 jnz L(Byte16) 1445 1446 test $0x02, %dh 1447 jnz L(Byte17) 1448 1449 test $0x04, %dh 1450 jnz L(Byte18) 1451 1452 test $0x08, %dh 
1453 jnz L(Byte19) 1454 1455 test $0x10, %dh 1456 jnz L(Byte20) 1457 1458 test $0x20, %dh 1459 jnz L(Byte21) 1460 1461 test $0x40, %dh 1462 jnz L(Byte22) 1463 1464 movzbl -9(%rdi), %eax 1465 movzbl -9(%rsi), %edx 1466 sub %edx, %eax 1467 ret 1468# else 1469/* special for wmemcmp */ 1470 xor %eax, %eax 1471 test %dl, %dl 1472 jz L(next_two_double_words) 1473 and $15, %dl 1474 jz L(second_double_word) 1475 mov -16(%rdi), %eax 1476 cmp -16(%rsi), %eax 1477 jne L(find_diff) 1478 ret 1479 1480 .p2align 4 1481L(second_double_word): 1482 mov -12(%rdi), %eax 1483 cmp -12(%rsi), %eax 1484 jne L(find_diff) 1485 ret 1486 1487 .p2align 4 1488L(next_two_double_words): 1489 and $15, %dh 1490 jz L(fourth_double_word) 1491 mov -8(%rdi), %eax 1492 cmp -8(%rsi), %eax 1493 jne L(find_diff) 1494 ret 1495 1496 .p2align 4 1497L(fourth_double_word): 1498 mov -4(%rdi), %eax 1499 cmp -4(%rsi), %eax 1500 jne L(find_diff) 1501 ret 1502# endif 1503 1504 .p2align 4 1505L(less48bytes): 1506 cmp $8, %ecx 1507 jae L(more8bytes) 1508 cmp $0, %ecx 1509 je L(0bytes) 1510# ifndef USE_AS_WMEMCMP 1511 cmp $1, %ecx 1512 je L(1bytes) 1513 cmp $2, %ecx 1514 je L(2bytes) 1515 cmp $3, %ecx 1516 je L(3bytes) 1517 cmp $4, %ecx 1518 je L(4bytes) 1519 cmp $5, %ecx 1520 je L(5bytes) 1521 cmp $6, %ecx 1522 je L(6bytes) 1523 jmp L(7bytes) 1524# else 1525 jmp L(4bytes) 1526# endif 1527 1528 .p2align 4 1529L(more8bytes): 1530 cmp $16, %ecx 1531 jae L(more16bytes) 1532 cmp $8, %ecx 1533 je L(8bytes) 1534# ifndef USE_AS_WMEMCMP 1535 cmp $9, %ecx 1536 je L(9bytes) 1537 cmp $10, %ecx 1538 je L(10bytes) 1539 cmp $11, %ecx 1540 je L(11bytes) 1541 cmp $12, %ecx 1542 je L(12bytes) 1543 cmp $13, %ecx 1544 je L(13bytes) 1545 cmp $14, %ecx 1546 je L(14bytes) 1547 jmp L(15bytes) 1548# else 1549 jmp L(12bytes) 1550# endif 1551 1552 .p2align 4 1553L(more16bytes): 1554 cmp $24, %ecx 1555 jae L(more24bytes) 1556 cmp $16, %ecx 1557 je L(16bytes) 1558# ifndef USE_AS_WMEMCMP 1559 cmp $17, %ecx 1560 je L(17bytes) 1561 cmp $18, %ecx 
/* Tail of MEMCMP: select and run the compare sequence for the last
   few bytes.  %ecx is tested against byte counts and control goes to
   the matching L(<N>bytes) label, which compares data at negative
   offsets from %rdi and %rsi.
   NOTE(review): the negative offsets imply both pointers sit just past
   the end of the data; the short-length path at function entry adds
   the length to both before jumping to the tail -- confirm the other
   entry paths do the same.

   The code that follows contains, in order:
   - The end of a compare/branch ladder whose start is not in view
     (presumably it is preceded by a "cmp $18, %ecx"): 18..22 are
     tested explicitly and 23 is the unconditional default.  When
     USE_AS_WMEMCMP is defined the ladder is just "jmp L(20bytes)".
   - L(more24bytes): >= 32 goes to L(more32bytes), 24 to L(24bytes);
     then 25..30 are tested and 31 is the default (28 for wmemcmp).
   - L(more32bytes): >= 40 goes to L(more40bytes), 32 to L(32bytes);
     then 33..38 are tested and 39 is the default (36 for wmemcmp).
   - L(more40bytes): 40 goes to L(40bytes); then 41..46 are tested and
     47 is the default.  In the wmemcmp build nothing follows the
     "je L(40bytes)" before the #else copy of L(44bytes), so execution
     falls through into it.
   Each unconditional default assumes %ecx cannot exceed the top of
   that ladder's range; that invariant is set up outside this view.
   The wmemcmp ladders only distinguish multiples of 4, presumably
   because the count is a number of 4-byte elements ("shl $2" at
   function entry) -- TODO confirm.

   - L(44bytes) ... (memcmp build): start of the fall-through chain
     for counts that are multiples of 4.  Each step loads one dword
     from each buffer into %eax (from %rdi) and %ecx (from %rsi) and
     leaves through L(find_diff) when they differ.  %ecx is
     overwritten here, so the byte count is gone once a chain starts.  */
1562 je L(18bytes) 1563 cmp $19, %ecx 1564 je L(19bytes) 1565 cmp $20, %ecx 1566 je L(20bytes) 1567 cmp $21, %ecx 1568 je L(21bytes) 1569 cmp $22, %ecx 1570 je L(22bytes) 1571 jmp L(23bytes) 1572# else 1573 jmp L(20bytes) 1574# endif 1575 1576 .p2align 4 1577L(more24bytes): 1578 cmp $32, %ecx 1579 jae L(more32bytes) 1580 cmp $24, %ecx 1581 je L(24bytes) 1582# ifndef USE_AS_WMEMCMP 1583 cmp $25, %ecx 1584 je L(25bytes) 1585 cmp $26, %ecx 1586 je L(26bytes) 1587 cmp $27, %ecx 1588 je L(27bytes) 1589 cmp $28, %ecx 1590 je L(28bytes) 1591 cmp $29, %ecx 1592 je L(29bytes) 1593 cmp $30, %ecx 1594 je L(30bytes) 1595 jmp L(31bytes) 1596# else 1597 jmp L(28bytes) 1598# endif 1599 1600 .p2align 4 1601L(more32bytes): 1602 cmp $40, %ecx 1603 jae L(more40bytes) 1604 cmp $32, %ecx 1605 je L(32bytes) 1606# ifndef USE_AS_WMEMCMP 1607 cmp $33, %ecx 1608 je L(33bytes) 1609 cmp $34, %ecx 1610 je L(34bytes) 1611 cmp $35, %ecx 1612 je L(35bytes) 1613 cmp $36, %ecx 1614 je L(36bytes) 1615 cmp $37, %ecx 1616 je L(37bytes) 1617 cmp $38, %ecx 1618 je L(38bytes) 1619 jmp L(39bytes) 1620# else 1621 jmp L(36bytes) 1622# endif 1623 1624 .p2align 4 1625L(more40bytes): 1626 cmp $40, %ecx 1627 je L(40bytes) 1628# ifndef USE_AS_WMEMCMP 1629 cmp $41, %ecx 1630 je L(41bytes) 1631 cmp $42, %ecx 1632 je L(42bytes) 1633 cmp $43, %ecx 1634 je L(43bytes) 1635 cmp $44, %ecx 1636 je L(44bytes) 1637 cmp $45, %ecx 1638 je L(45bytes) 1639 cmp $46, %ecx 1640 je L(46bytes) 1641 jmp L(47bytes) 1642 1643 .p2align 4 1644L(44bytes): 1645 movl -44(%rdi), %eax 1646 movl -44(%rsi), %ecx 1647 cmp %ecx, %eax 1648 jne L(find_diff) 1649L(40bytes): 1650 movl -40(%rdi), %eax 1651 movl -40(%rsi), %ecx 1652 cmp %ecx, %eax 1653 jne L(find_diff) 1654L(36bytes): 1655 movl -36(%rdi), %eax 1656 movl -36(%rsi), %ecx 1657 cmp %ecx, %eax 1658 jne L(find_diff) 1659L(32bytes): 1660 movl -32(%rdi), %eax 1661 movl -32(%rsi), %ecx 1662 cmp %ecx, %eax 1663 jne L(find_diff) 1664L(28bytes): 1665 movl -28(%rdi), %eax 1666 movl -28(%rsi), %ecx 
/* Continues the multiple-of-4 chain of the memcmp build: finishes
   L(28bytes), then L(24bytes) down to L(4bytes), each falling through
   into the next, and ends in L(0bytes), which returns 0 in %eax
   (nothing differed).

   The #else part is the wmemcmp copy of the same chain, L(44bytes)
   down to L(0bytes).  It compares the %rsi-side dword straight from
   memory ("cmp -N(%rsi), %eax") instead of loading it into %ecx: the
   wmemcmp L(find_diff) only consumes the flags of that compare, while
   the memcmp L(find_diff) needs both dwords in registers.

   After the #endif, memcmp build only: start of the chain for counts
   of the form 4k+1, L(45bytes), L(41bytes), ...  */
1667 cmp %ecx, %eax 1668 jne L(find_diff) 1669L(24bytes): 1670 movl -24(%rdi), %eax 1671 movl -24(%rsi), %ecx 1672 cmp %ecx, %eax 1673 jne L(find_diff) 1674L(20bytes): 1675 movl -20(%rdi), %eax 1676 movl -20(%rsi), %ecx 1677 cmp %ecx, %eax 1678 jne L(find_diff) 1679L(16bytes): 1680 movl -16(%rdi), %eax 1681 movl -16(%rsi), %ecx 1682 cmp %ecx, %eax 1683 jne L(find_diff) 1684L(12bytes): 1685 movl -12(%rdi), %eax 1686 movl -12(%rsi), %ecx 1687 cmp %ecx, %eax 1688 jne L(find_diff) 1689L(8bytes): 1690 movl -8(%rdi), %eax 1691 movl -8(%rsi), %ecx 1692 cmp %ecx, %eax 1693 jne L(find_diff) 1694L(4bytes): 1695 movl -4(%rdi), %eax 1696 movl -4(%rsi), %ecx 1697 cmp %ecx, %eax 1698 jne L(find_diff) 1699L(0bytes): 1700 xor %eax, %eax 1701 ret 1702# else 1703 .p2align 4 1704L(44bytes): 1705 movl -44(%rdi), %eax 1706 cmp -44(%rsi), %eax 1707 jne L(find_diff) 1708L(40bytes): 1709 movl -40(%rdi), %eax 1710 cmp -40(%rsi), %eax 1711 jne L(find_diff) 1712L(36bytes): 1713 movl -36(%rdi), %eax 1714 cmp -36(%rsi), %eax 1715 jne L(find_diff) 1716L(32bytes): 1717 movl -32(%rdi), %eax 1718 cmp -32(%rsi), %eax 1719 jne L(find_diff) 1720L(28bytes): 1721 movl -28(%rdi), %eax 1722 cmp -28(%rsi), %eax 1723 jne L(find_diff) 1724L(24bytes): 1725 movl -24(%rdi), %eax 1726 cmp -24(%rsi), %eax 1727 jne L(find_diff) 1728L(20bytes): 1729 movl -20(%rdi), %eax 1730 cmp -20(%rsi), %eax 1731 jne L(find_diff) 1732L(16bytes): 1733 movl -16(%rdi), %eax 1734 cmp -16(%rsi), %eax 1735 jne L(find_diff) 1736L(12bytes): 1737 movl -12(%rdi), %eax 1738 cmp -12(%rsi), %eax 1739 jne L(find_diff) 1740L(8bytes): 1741 movl -8(%rdi), %eax 1742 cmp -8(%rsi), %eax 1743 jne L(find_diff) 1744L(4bytes): 1745 movl -4(%rdi), %eax 1746 cmp -4(%rsi), %eax 1747 jne L(find_diff) 1748L(0bytes): 1749 xor %eax, %eax 1750 ret 1751# endif 1752 1753# ifndef USE_AS_WMEMCMP 1754 .p2align 4 1755L(45bytes): 1756 movl -45(%rdi), %eax 1757 movl -45(%rsi), %ecx 1758 cmp %ecx, %eax 1759 jne L(find_diff) 1760L(41bytes): 1761 movl -41(%rdi), %eax 
/* memcmp build only.  Continues the 4k+1 chain: finishes L(41bytes),
   then L(37bytes) down to L(5bytes), one dword per step.  L(1bytes)
   compares the single last byte at offset -1; on a mismatch it enters
   L(set) directly, with the carry flag coming from the unsigned
   "cmpb -1(%rsi), %al"; otherwise it returns 0.

   Then the chain for counts of the form 4k+2 begins: L(46bytes),
   L(42bytes), ..., same dword-per-step pattern.  */
1762 movl -41(%rsi), %ecx 1763 cmp %ecx, %eax 1764 jne L(find_diff) 1765L(37bytes): 1766 movl -37(%rdi), %eax 1767 movl -37(%rsi), %ecx 1768 cmp %ecx, %eax 1769 jne L(find_diff) 1770L(33bytes): 1771 movl -33(%rdi), %eax 1772 movl -33(%rsi), %ecx 1773 cmp %ecx, %eax 1774 jne L(find_diff) 1775L(29bytes): 1776 movl -29(%rdi), %eax 1777 movl -29(%rsi), %ecx 1778 cmp %ecx, %eax 1779 jne L(find_diff) 1780L(25bytes): 1781 movl -25(%rdi), %eax 1782 movl -25(%rsi), %ecx 1783 cmp %ecx, %eax 1784 jne L(find_diff) 1785L(21bytes): 1786 movl -21(%rdi), %eax 1787 movl -21(%rsi), %ecx 1788 cmp %ecx, %eax 1789 jne L(find_diff) 1790L(17bytes): 1791 movl -17(%rdi), %eax 1792 movl -17(%rsi), %ecx 1793 cmp %ecx, %eax 1794 jne L(find_diff) 1795L(13bytes): 1796 movl -13(%rdi), %eax 1797 movl -13(%rsi), %ecx 1798 cmp %ecx, %eax 1799 jne L(find_diff) 1800L(9bytes): 1801 movl -9(%rdi), %eax 1802 movl -9(%rsi), %ecx 1803 cmp %ecx, %eax 1804 jne L(find_diff) 1805L(5bytes): 1806 movl -5(%rdi), %eax 1807 movl -5(%rsi), %ecx 1808 cmp %ecx, %eax 1809 jne L(find_diff) 1810L(1bytes): 1811 movzbl -1(%rdi), %eax 1812 cmpb -1(%rsi), %al 1813 jne L(set) 1814 xor %eax, %eax 1815 ret 1816 1817 .p2align 4 1818L(46bytes): 1819 movl -46(%rdi), %eax 1820 movl -46(%rsi), %ecx 1821 cmp %ecx, %eax 1822 jne L(find_diff) 1823L(42bytes): 1824 movl -42(%rdi), %eax 1825 movl -42(%rsi), %ecx 1826 cmp %ecx, %eax 1827 jne L(find_diff) 1828L(38bytes): 1829 movl -38(%rdi), %eax 1830 movl -38(%rsi), %ecx 1831 cmp %ecx, %eax 1832 jne L(find_diff) 1833L(34bytes): 1834 movl -34(%rdi), %eax 1835 movl -34(%rsi), %ecx 1836 cmp %ecx, %eax 1837 jne L(find_diff) 1838L(30bytes): 1839 movl -30(%rdi), %eax 1840 movl -30(%rsi), %ecx 1841 cmp %ecx, %eax 1842 jne L(find_diff) 1843L(26bytes): 1844 movl -26(%rdi), %eax 1845 movl -26(%rsi), %ecx 1846 cmp %ecx, %eax 1847 jne L(find_diff) 1848L(22bytes): 1849 movl -22(%rdi), %eax 1850 movl -22(%rsi), %ecx 1851 cmp %ecx, %eax 1852 jne L(find_diff) 1853L(18bytes): 1854 movl -18(%rdi), %eax 
/* memcmp build only.  Continues the 4k+2 chain: finishes L(18bytes),
   then L(14bytes) down to L(6bytes).  L(2bytes) loads the last two
   bytes zero-extended.  It first compares the low bytes (the lower
   address, since x86 is little-endian); when those match, the full
   compare of the zero-extended words is decided by the second byte
   alone.  Either mismatch enters L(set) with the carry flag of that
   unsigned compare; otherwise 0 is returned.

   Then the chain for counts of the form 4k+3: L(47bytes) down to
   L(7bytes).  L(3bytes) checks the word at offset -3 the same way as
   L(2bytes) and then the final byte at offset -1.  */
1855 movl -18(%rsi), %ecx 1856 cmp %ecx, %eax 1857 jne L(find_diff) 1858L(14bytes): 1859 movl -14(%rdi), %eax 1860 movl -14(%rsi), %ecx 1861 cmp %ecx, %eax 1862 jne L(find_diff) 1863L(10bytes): 1864 movl -10(%rdi), %eax 1865 movl -10(%rsi), %ecx 1866 cmp %ecx, %eax 1867 jne L(find_diff) 1868L(6bytes): 1869 movl -6(%rdi), %eax 1870 movl -6(%rsi), %ecx 1871 cmp %ecx, %eax 1872 jne L(find_diff) 1873L(2bytes): 1874 movzwl -2(%rdi), %eax 1875 movzwl -2(%rsi), %ecx 1876 cmpb %cl, %al 1877 jne L(set) 1878 cmp %ecx, %eax 1879 jne L(set) 1880 xor %eax, %eax 1881 ret 1882 1883 .p2align 4 1884L(47bytes): 1885 movl -47(%rdi), %eax 1886 movl -47(%rsi), %ecx 1887 cmp %ecx, %eax 1888 jne L(find_diff) 1889L(43bytes): 1890 movl -43(%rdi), %eax 1891 movl -43(%rsi), %ecx 1892 cmp %ecx, %eax 1893 jne L(find_diff) 1894L(39bytes): 1895 movl -39(%rdi), %eax 1896 movl -39(%rsi), %ecx 1897 cmp %ecx, %eax 1898 jne L(find_diff) 1899L(35bytes): 1900 movl -35(%rdi), %eax 1901 movl -35(%rsi), %ecx 1902 cmp %ecx, %eax 1903 jne L(find_diff) 1904L(31bytes): 1905 movl -31(%rdi), %eax 1906 movl -31(%rsi), %ecx 1907 cmp %ecx, %eax 1908 jne L(find_diff) 1909L(27bytes): 1910 movl -27(%rdi), %eax 1911 movl -27(%rsi), %ecx 1912 cmp %ecx, %eax 1913 jne L(find_diff) 1914L(23bytes): 1915 movl -23(%rdi), %eax 1916 movl -23(%rsi), %ecx 1917 cmp %ecx, %eax 1918 jne L(find_diff) 1919L(19bytes): 1920 movl -19(%rdi), %eax 1921 movl -19(%rsi), %ecx 1922 cmp %ecx, %eax 1923 jne L(find_diff) 1924L(15bytes): 1925 movl -15(%rdi), %eax 1926 movl -15(%rsi), %ecx 1927 cmp %ecx, %eax 1928 jne L(find_diff) 1929L(11bytes): 1930 movl -11(%rdi), %eax 1931 movl -11(%rsi), %ecx 1932 cmp %ecx, %eax 1933 jne L(find_diff) 1934L(7bytes): 1935 movl -7(%rdi), %eax 1936 movl -7(%rsi), %ecx 1937 cmp %ecx, %eax 1938 jne L(find_diff) 1939L(3bytes): 1940 movzwl -3(%rdi), %eax 1941 movzwl -3(%rsi), %ecx 1942 cmpb %cl, %al 1943 jne L(set) 1944 cmp %ecx, %eax 1945 jne L(set) 1946 movzbl -1(%rdi), %eax 1947 cmpb -1(%rsi), %al 1948 jne L(set) 
/* End of L(3bytes): all three trailing bytes matched, return 0.

   L(find_diff), memcmp build.  In: %eax = dword from the %rdi buffer,
   %ecx = dword from the %rsi buffer, known to differ.  It looks for
   the lowest-addressed differing byte (x86 is little-endian, so the
   low byte is the lowest address):
     cmpb %cl, %al     byte 0
     cmpw %cx, %ax     bytes 0-1; byte 0 is equal, so byte 1 decides
     shr $16 on both, then cmpb for byte 2
     cmp %ecx, %eax    byte 2 is equal, so byte 3 decides
   Every exit reaches L(set) with the carry flag set when the %rdi-side
   byte is below the %rsi-side byte (unsigned).

   L(set) turns the carry flag into the return value without a branch:
     sbb %eax, %eax    %eax = -CF (0 or -1), CF is preserved
     sbb $-1, %eax     %eax = %eax + 1 - CF, i.e. 1 if CF was clear,
                       -1 if CF was set

   L(find_diff), wmemcmp build.  Entered through "jne" right after
   "cmp -N(%rsi), %eax"; "mov $1, %eax" leaves those flags untouched,
   so "jg" still tests the signed result of that compare.  It returns 1
   when the %rdi-side element is greater and -1 (via neg) otherwise.
   L(find_diff_bigger) is just the return for the +1 case.

   L(equal) returns 0; in the visible code it is the target of the
   zero-length check at function entry in the wmemcmp build.
   END (MEMCMP) closes the function and the final #endif closes the
   "#if IS_IN (libc)" guard at the top of the file.  */
1949 xor %eax, %eax 1950 ret 1951 1952 .p2align 4 1953L(find_diff): 1954 cmpb %cl, %al 1955 jne L(set) 1956 cmpw %cx, %ax 1957 jne L(set) 1958 shr $16, %eax 1959 shr $16, %ecx 1960 cmpb %cl, %al 1961 jne L(set) 1962 1963/* We get there only if we already know there is a 1964difference. */ 1965 1966 cmp %ecx, %eax 1967L(set): 1968 sbb %eax, %eax 1969 sbb $-1, %eax 1970 ret 1971# else 1972 1973/* for wmemcmp */ 1974 .p2align 4 1975L(find_diff): 1976 mov $1, %eax 1977 jg L(find_diff_bigger) 1978 neg %eax 1979 ret 1980 1981 .p2align 4 1982L(find_diff_bigger): 1983 ret 1984# endif 1985 1986 .p2align 4 1987L(equal): 1988 xor %eax, %eax 1989 ret 1990 1991END (MEMCMP) 1992#endif 1993