/* Highly optimized version for x86-64.
   Copyright (C) 1999-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "asm-syntax.h"

/* This source is built in several flavours selected by the includer:
   USE_AS_STRNCMP, USE_AS_STRCASECMP_L, USE_AS_STRNCASECMP_L, or none of
   them (in which case STRCMP defaults to strcmp below).  Drop any
   earlier definition of the counter macro so the one chosen here is the
   only one in effect.  */
#undef UPDATE_STRNCMP_COUNTER

/* Local-label wrapper.  An includer may predefine LABEL; otherwise it
   maps to L(l) (presumably provided by <sysdep.h> -- TODO confirm).  */
#ifndef LABEL
#define LABEL(l) L(l)
#endif

#ifdef USE_AS_STRNCMP
/* UPDATE_STRNCMP_COUNTER -- adjust the remaining-length counter in %r11
   after the first, partial 16-byte block has been compared.

   In:    %r11 = remaining byte count, %rcx = value set up by the
	  expansion site (the STRCMP body masks it to the source offset
	  within a 16-byte block before the macro is used).
   Out:   %r11 = %rcx + %r11 - 16 when execution falls through.
   Clobb: %r9, flags.

   Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.  A new value larger than
   the old one means the subtraction wrapped around.  */
# define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	LABEL(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	LABEL(strcmp_exitz);			\
	mov	%r9, %r11

#elif defined USE_AS_STRCASECMP_L
# include "locale-defines.h"

/* This flavour defines the macro empty, so every expansion site
   compiles to nothing.  */
# define UPDATE_STRNCMP_COUNTER
#elif defined USE_AS_STRNCASECMP_L
# include "locale-defines.h"

/* Same counter update as the USE_AS_STRNCMP definition above:
   %r9 = %rcx + %r11 - 16; leave through strcmp_exitz if that wrapped
   (unsigned %r11 < %r9) or is zero, otherwise store it back to %r11.  */
# define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	LABEL(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	LABEL(strcmp_exitz);			\
	mov	%r9, %r11
#else
/* No flavour macro given: the counter macro is empty and the entry
   point is named strcmp unless the includer chose another STRCMP.  */
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
# define STRCMP strcmp
# endif
#endif

/* Code is emitted into .text, or into a separate .text.ssse3 section
   when USE_SSSE3 is defined.  */
#ifndef USE_SSSE3
	.text
#else
	.section .text.ssse3,"ax",@progbits
#endif

#ifdef USE_AS_STRCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
# define END2(name) END (name)
# endif

/* __strcasecmp -- stub without an explicit locale argument.

   Reads the pointer stored in the thread-local variable
   __libc_tsd_LOCALE (presumably the calling thread's current locale --
   confirm against its definition) into %rdx, which is the register the
   STRCMP body below reads the locale from in this flavour.  There is
   no ret: after the padding NOP, execution continues into the entry
   point that follows.

   Clobb: %rax, %rdx.  */
ENTRY2 (__strcasecmp)
	/* %rax = offset of __libc_tsd_LOCALE from the thread pointer,
	   fetched from the GOT; the variable itself is then read
	   through the %fs segment.  */
	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
	mov	%fs:(%rax),%RDX_LP

	// XXX 5 byte should be before the function
	/* 5-byte NOP.  */
	/* NOTE(review): presumably this pads the stub so the entry
	   point it falls into starts on an aligned address -- confirm
	   against the alignment applied by ENTRY.  */
	.byte	0x0f,0x1f,0x44,0x00,0x00
END2 (__strcasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
libc_hidden_def (__strcasecmp)
# endif
	/* FALLTHROUGH to strcasecmp_l.  */
#elif defined USE_AS_STRNCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
# define END2(name) END (name)
# endif

/* __strncasecmp -- stub without an explicit locale argument.

   Same scheme as the strcasecmp stub, except that the pointer read
   from __libc_tsd_LOCALE is placed in %rcx, which is the register the
   STRCMP body below reads the locale from in this flavour (%rdx holds
   the length there).  There is no ret: execution continues into the
   entry point that follows.

   Clobb: %rax, %rcx.  */
ENTRY2 (__strncasecmp)
	/* %rax = offset of __libc_tsd_LOCALE from the thread pointer,
	   fetched from the GOT; the variable itself is then read
	   through the %fs segment.  */
	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
	mov	%fs:(%rax),%RCX_LP

	// XXX 5 byte should be before the function
	/* 5-byte NOP.  */
	/* NOTE(review): presumably this pads the stub so the entry
	   point it falls into starts on an aligned address -- confirm
	   against the alignment applied by ENTRY.  */
	.byte	0x0f,0x1f,0x44,0x00,0x00
END2 (__strncasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strncasecmp, strncasecmp)
libc_hidden_def (__strncasecmp)
# endif
	/* FALLTHROUGH to strncasecmp_l.
*/ 106#endif 107 108ENTRY (STRCMP) 109#ifdef USE_AS_STRCASECMP_L 110 /* We have to fall back on the C implementation for locales 111 with encodings not matching ASCII for single bytes. */ 112# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 113 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP 114# else 115 mov (%rdx), %RAX_LP 116# endif 117 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 118 jne __strcasecmp_l_nonascii 119#elif defined USE_AS_STRNCASECMP_L 120 /* We have to fall back on the C implementation for locales 121 with encodings not matching ASCII for single bytes. */ 122# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 123 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP 124# else 125 mov (%rcx), %RAX_LP 126# endif 127 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 128 jne __strncasecmp_l_nonascii 129#endif 130 131/* 132 * This implementation uses SSE to compare up to 16 bytes at a time. 133 */ 134#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 135 test %RDX_LP, %RDX_LP 136 je LABEL(strcmp_exitz) 137 cmp $1, %RDX_LP 138 je LABEL(Byte0) 139 mov %RDX_LP, %R11_LP 140#endif 141 mov %esi, %ecx 142 mov %edi, %eax 143/* Use 64bit AND here to avoid long NOP padding. 
*/ 144 and $0x3f, %rcx /* rsi alignment in cache line */ 145 and $0x3f, %rax /* rdi alignment in cache line */ 146#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 147 .section .rodata.cst16,"aM",@progbits,16 148 .align 16 149.Lbelowupper: 150 .quad 0x4040404040404040 151 .quad 0x4040404040404040 152.Ltopupper: 153 .quad 0x5b5b5b5b5b5b5b5b 154 .quad 0x5b5b5b5b5b5b5b5b 155.Ltouppermask: 156 .quad 0x2020202020202020 157 .quad 0x2020202020202020 158 .previous 159 movdqa .Lbelowupper(%rip), %xmm5 160# define UCLOW_reg %xmm5 161 movdqa .Ltopupper(%rip), %xmm6 162# define UCHIGH_reg %xmm6 163 movdqa .Ltouppermask(%rip), %xmm7 164# define LCQWORD_reg %xmm7 165#endif 166 cmp $0x30, %ecx 167 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ 168 cmp $0x30, %eax 169 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ 170 movlpd (%rdi), %xmm1 171 movlpd (%rsi), %xmm2 172 movhpd 8(%rdi), %xmm1 173 movhpd 8(%rsi), %xmm2 174#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 175# define TOLOWER(reg1, reg2) \ 176 movdqa reg1, %xmm8; \ 177 movdqa UCHIGH_reg, %xmm9; \ 178 movdqa reg2, %xmm10; \ 179 movdqa UCHIGH_reg, %xmm11; \ 180 pcmpgtb UCLOW_reg, %xmm8; \ 181 pcmpgtb reg1, %xmm9; \ 182 pcmpgtb UCLOW_reg, %xmm10; \ 183 pcmpgtb reg2, %xmm11; \ 184 pand %xmm9, %xmm8; \ 185 pand %xmm11, %xmm10; \ 186 pand LCQWORD_reg, %xmm8; \ 187 pand LCQWORD_reg, %xmm10; \ 188 por %xmm8, reg1; \ 189 por %xmm10, reg2 190 TOLOWER (%xmm1, %xmm2) 191#else 192# define TOLOWER(reg1, reg2) 193#endif 194 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 195 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 196 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ 197 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 198 pmovmskb %xmm1, %edx 199 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 200 jnz LABEL(less16bytes) /* If not, find different value or null char */ 201#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 202 sub $16, %r11 203 jbe LABEL(strcmp_exitz) /* finish comparision */ 204#endif 205 add $16, %rsi /* prepare to search next 16 bytes */ 206 add $16, %rdi /* prepare to search next 16 bytes */ 207 208 /* 209 * Determine source and destination string offsets from 16-byte alignment. 210 * Use relative offset difference between the two to determine which case 211 * below to use. 212 */ 213 .p2align 4 214LABEL(crosscache): 215 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ 216 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ 217 mov $0xffff, %edx /* for equivalent offset */ 218 xor %r8d, %r8d 219 and $0xf, %ecx /* offset of rsi */ 220 and $0xf, %eax /* offset of rdi */ 221 cmp %eax, %ecx 222 je LABEL(ashr_0) /* rsi and rdi relative offset same */ 223 ja LABEL(bigger) 224 mov %edx, %r8d /* r8d is offset flag for exit tail */ 225 xchg %ecx, %eax 226 xchg %rsi, %rdi 227LABEL(bigger): 228 lea 15(%rax), %r9 229 sub %rcx, %r9 230 lea LABEL(unaligned_table)(%rip), %r10 231 movslq (%r10, %r9,4), %r9 232 lea (%r10, %r9), %r10 233 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ 234 235/* 236 * The following cases will be handled by ashr_0 237 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 238 * n(0~15) n(0~15) 15(15+ n-n) ashr_0 239 */ 240 .p2align 4 241LABEL(ashr_0): 242 243 movdqa (%rsi), %xmm1 244 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ 245 pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 246#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 247 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ 248#else 249 movdqa (%rdi), %xmm2 250 TOLOWER (%xmm1, %xmm2) 251 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ 252#endif 253 psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 254 pmovmskb %xmm1, %r9d 255 shr %cl, %edx /* adjust 0xffff for offset */ 256 shr %cl, %r9d /* adjust for 16-byte offset */ 257 sub %r9d, %edx 258 /* 259 * edx must be the same with r9d if in left byte (16-rcx) is equal to 260 * the start from (16-rax) and no null char was seen. 261 */ 262 jne LABEL(less32bytes) /* mismatch or null char */ 263 UPDATE_STRNCMP_COUNTER 264 mov $16, %rcx 265 mov $16, %r9 266 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ 267 268 /* 269 * Now both strings are aligned at 16-byte boundary. Loop over strings 270 * checking 32-bytes per iteration. 271 */ 272 .p2align 4 273LABEL(loop_ashr_0): 274 movdqa (%rsi, %rcx), %xmm1 275 movdqa (%rdi, %rcx), %xmm2 276 TOLOWER (%xmm1, %xmm2) 277 278 pcmpeqb %xmm1, %xmm0 279 pcmpeqb %xmm2, %xmm1 280 psubb %xmm0, %xmm1 281 pmovmskb %xmm1, %edx 282 sub $0xffff, %edx 283 jnz LABEL(exit) /* mismatch or null char seen */ 284 285#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 286 sub $16, %r11 287 jbe LABEL(strcmp_exitz) 288#endif 289 add $16, %rcx 290 movdqa (%rsi, %rcx), %xmm1 291 movdqa (%rdi, %rcx), %xmm2 292 TOLOWER (%xmm1, %xmm2) 293 294 pcmpeqb %xmm1, %xmm0 295 pcmpeqb %xmm2, %xmm1 296 psubb %xmm0, %xmm1 297 pmovmskb %xmm1, %edx 298 sub $0xffff, %edx 299 jnz LABEL(exit) 300#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 301 sub $16, %r11 302 jbe LABEL(strcmp_exitz) 303#endif 304 add $16, %rcx 305 jmp LABEL(loop_ashr_0) 306 307/* 308 * The following cases will be handled by ashr_1 309 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 310 * n(15) n -15 0(15 +(n-15) - n) ashr_1 311 */ 312 .p2align 4 313LABEL(ashr_1): 314 pxor 
%xmm0, %xmm0 315 movdqa (%rdi), %xmm2 316 movdqa (%rsi), %xmm1 317 pcmpeqb %xmm1, %xmm0 /* Any null chars? */ 318 pslldq $15, %xmm2 /* shift first string to align with second */ 319 TOLOWER (%xmm1, %xmm2) 320 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ 321 psubb %xmm0, %xmm2 /* packed sub of comparison results*/ 322 pmovmskb %xmm2, %r9d 323 shr %cl, %edx /* adjust 0xffff for offset */ 324 shr %cl, %r9d /* adjust for 16-byte offset */ 325 sub %r9d, %edx 326 jnz LABEL(less32bytes) /* mismatch or null char seen */ 327 movdqa (%rdi), %xmm3 328 UPDATE_STRNCMP_COUNTER 329 330 pxor %xmm0, %xmm0 331 mov $16, %rcx /* index for loads*/ 332 mov $1, %r9d /* byte position left over from less32bytes case */ 333 /* 334 * Setup %r10 value allows us to detect crossing a page boundary. 335 * When %r10 goes positive we have crossed a page boundary and 336 * need to do a nibble. 337 */ 338 lea 1(%rdi), %r10 339 and $0xfff, %r10 /* offset into 4K page */ 340 sub $0x1000, %r10 /* subtract 4K pagesize */ 341 342 .p2align 4 343LABEL(loop_ashr_1): 344 add $16, %r10 345 jg LABEL(nibble_ashr_1) /* cross page boundary */ 346 347LABEL(gobble_ashr_1): 348 movdqa (%rsi, %rcx), %xmm1 349 movdqa (%rdi, %rcx), %xmm2 350 movdqa %xmm2, %xmm4 /* store for next cycle */ 351 352#ifndef USE_SSSE3 353 psrldq $1, %xmm3 354 pslldq $15, %xmm2 355 por %xmm3, %xmm2 /* merge into one 16byte value */ 356#else 357 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ 358#endif 359 TOLOWER (%xmm1, %xmm2) 360 361 pcmpeqb %xmm1, %xmm0 362 pcmpeqb %xmm2, %xmm1 363 psubb %xmm0, %xmm1 364 pmovmskb %xmm1, %edx 365 sub $0xffff, %edx 366 jnz LABEL(exit) 367 368#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 369 sub $16, %r11 370 jbe LABEL(strcmp_exitz) 371#endif 372 add $16, %rcx 373 movdqa %xmm4, %xmm3 374 375 add $16, %r10 376 jg LABEL(nibble_ashr_1) /* cross page boundary */ 377 378 movdqa (%rsi, %rcx), %xmm1 379 movdqa (%rdi, %rcx), %xmm2 380 movdqa %xmm2, %xmm4 /* store for next cycle */ 
381 382#ifndef USE_SSSE3 383 psrldq $1, %xmm3 384 pslldq $15, %xmm2 385 por %xmm3, %xmm2 /* merge into one 16byte value */ 386#else 387 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ 388#endif 389 TOLOWER (%xmm1, %xmm2) 390 391 pcmpeqb %xmm1, %xmm0 392 pcmpeqb %xmm2, %xmm1 393 psubb %xmm0, %xmm1 394 pmovmskb %xmm1, %edx 395 sub $0xffff, %edx 396 jnz LABEL(exit) 397 398#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 399 sub $16, %r11 400 jbe LABEL(strcmp_exitz) 401#endif 402 add $16, %rcx 403 movdqa %xmm4, %xmm3 404 jmp LABEL(loop_ashr_1) 405 406 /* 407 * Nibble avoids loads across page boundary. This is to avoid a potential 408 * access into unmapped memory. 409 */ 410 .p2align 4 411LABEL(nibble_ashr_1): 412 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ 413 pmovmskb %xmm0, %edx 414 test $0xfffe, %edx 415 jnz LABEL(ashr_1_exittail) /* find null char*/ 416 417#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 418 cmp $15, %r11 419 jbe LABEL(ashr_1_exittail) 420#endif 421 422 pxor %xmm0, %xmm0 423 sub $0x1000, %r10 /* substract 4K from %r10 */ 424 jmp LABEL(gobble_ashr_1) 425 426 /* 427 * Once find null char, determine if there is a string mismatch 428 * before the null char. 
429 */ 430 .p2align 4 431LABEL(ashr_1_exittail): 432 movdqa (%rsi, %rcx), %xmm1 433 psrldq $1, %xmm0 434 psrldq $1, %xmm3 435 jmp LABEL(aftertail) 436 437/* 438 * The following cases will be handled by ashr_2 439 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 440 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 441 */ 442 .p2align 4 443LABEL(ashr_2): 444 pxor %xmm0, %xmm0 445 movdqa (%rdi), %xmm2 446 movdqa (%rsi), %xmm1 447 pcmpeqb %xmm1, %xmm0 448 pslldq $14, %xmm2 449 TOLOWER (%xmm1, %xmm2) 450 pcmpeqb %xmm1, %xmm2 451 psubb %xmm0, %xmm2 452 pmovmskb %xmm2, %r9d 453 shr %cl, %edx 454 shr %cl, %r9d 455 sub %r9d, %edx 456 jnz LABEL(less32bytes) 457 movdqa (%rdi), %xmm3 458 UPDATE_STRNCMP_COUNTER 459 460 pxor %xmm0, %xmm0 461 mov $16, %rcx /* index for loads */ 462 mov $2, %r9d /* byte position left over from less32bytes case */ 463 /* 464 * Setup %r10 value allows us to detect crossing a page boundary. 465 * When %r10 goes positive we have crossed a page boundary and 466 * need to do a nibble. 
467 */ 468 lea 2(%rdi), %r10 469 and $0xfff, %r10 /* offset into 4K page */ 470 sub $0x1000, %r10 /* subtract 4K pagesize */ 471 472 .p2align 4 473LABEL(loop_ashr_2): 474 add $16, %r10 475 jg LABEL(nibble_ashr_2) 476 477LABEL(gobble_ashr_2): 478 movdqa (%rsi, %rcx), %xmm1 479 movdqa (%rdi, %rcx), %xmm2 480 movdqa %xmm2, %xmm4 481 482#ifndef USE_SSSE3 483 psrldq $2, %xmm3 484 pslldq $14, %xmm2 485 por %xmm3, %xmm2 /* merge into one 16byte value */ 486#else 487 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ 488#endif 489 TOLOWER (%xmm1, %xmm2) 490 491 pcmpeqb %xmm1, %xmm0 492 pcmpeqb %xmm2, %xmm1 493 psubb %xmm0, %xmm1 494 pmovmskb %xmm1, %edx 495 sub $0xffff, %edx 496 jnz LABEL(exit) 497 498#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 499 sub $16, %r11 500 jbe LABEL(strcmp_exitz) 501#endif 502 503 add $16, %rcx 504 movdqa %xmm4, %xmm3 505 506 add $16, %r10 507 jg LABEL(nibble_ashr_2) /* cross page boundary */ 508 509 movdqa (%rsi, %rcx), %xmm1 510 movdqa (%rdi, %rcx), %xmm2 511 movdqa %xmm2, %xmm4 512 513#ifndef USE_SSSE3 514 psrldq $2, %xmm3 515 pslldq $14, %xmm2 516 por %xmm3, %xmm2 /* merge into one 16byte value */ 517#else 518 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ 519#endif 520 TOLOWER (%xmm1, %xmm2) 521 522 pcmpeqb %xmm1, %xmm0 523 pcmpeqb %xmm2, %xmm1 524 psubb %xmm0, %xmm1 525 pmovmskb %xmm1, %edx 526 sub $0xffff, %edx 527 jnz LABEL(exit) 528 529#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 530 sub $16, %r11 531 jbe LABEL(strcmp_exitz) 532#endif 533 534 add $16, %rcx 535 movdqa %xmm4, %xmm3 536 jmp LABEL(loop_ashr_2) 537 538 .p2align 4 539LABEL(nibble_ashr_2): 540 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 541 pmovmskb %xmm0, %edx 542 test $0xfffc, %edx 543 jnz LABEL(ashr_2_exittail) 544 545#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 546 cmp $14, %r11 547 jbe LABEL(ashr_2_exittail) 548#endif 549 550 pxor %xmm0, %xmm0 551 sub $0x1000, %r10 552 jmp LABEL(gobble_ashr_2) 553 554 
.p2align 4 555LABEL(ashr_2_exittail): 556 movdqa (%rsi, %rcx), %xmm1 557 psrldq $2, %xmm0 558 psrldq $2, %xmm3 559 jmp LABEL(aftertail) 560 561/* 562 * The following cases will be handled by ashr_3 563 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 564 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 565 */ 566 .p2align 4 567LABEL(ashr_3): 568 pxor %xmm0, %xmm0 569 movdqa (%rdi), %xmm2 570 movdqa (%rsi), %xmm1 571 pcmpeqb %xmm1, %xmm0 572 pslldq $13, %xmm2 573 TOLOWER (%xmm1, %xmm2) 574 pcmpeqb %xmm1, %xmm2 575 psubb %xmm0, %xmm2 576 pmovmskb %xmm2, %r9d 577 shr %cl, %edx 578 shr %cl, %r9d 579 sub %r9d, %edx 580 jnz LABEL(less32bytes) 581 movdqa (%rdi), %xmm3 582 583 UPDATE_STRNCMP_COUNTER 584 585 pxor %xmm0, %xmm0 586 mov $16, %rcx /* index for loads */ 587 mov $3, %r9d /* byte position left over from less32bytes case */ 588 /* 589 * Setup %r10 value allows us to detect crossing a page boundary. 590 * When %r10 goes positive we have crossed a page boundary and 591 * need to do a nibble. 
592 */ 593 lea 3(%rdi), %r10 594 and $0xfff, %r10 /* offset into 4K page */ 595 sub $0x1000, %r10 /* subtract 4K pagesize */ 596 597 .p2align 4 598LABEL(loop_ashr_3): 599 add $16, %r10 600 jg LABEL(nibble_ashr_3) 601 602LABEL(gobble_ashr_3): 603 movdqa (%rsi, %rcx), %xmm1 604 movdqa (%rdi, %rcx), %xmm2 605 movdqa %xmm2, %xmm4 606 607#ifndef USE_SSSE3 608 psrldq $3, %xmm3 609 pslldq $13, %xmm2 610 por %xmm3, %xmm2 /* merge into one 16byte value */ 611#else 612 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ 613#endif 614 TOLOWER (%xmm1, %xmm2) 615 616 pcmpeqb %xmm1, %xmm0 617 pcmpeqb %xmm2, %xmm1 618 psubb %xmm0, %xmm1 619 pmovmskb %xmm1, %edx 620 sub $0xffff, %edx 621 jnz LABEL(exit) 622 623#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 624 sub $16, %r11 625 jbe LABEL(strcmp_exitz) 626#endif 627 628 add $16, %rcx 629 movdqa %xmm4, %xmm3 630 631 add $16, %r10 632 jg LABEL(nibble_ashr_3) /* cross page boundary */ 633 634 movdqa (%rsi, %rcx), %xmm1 635 movdqa (%rdi, %rcx), %xmm2 636 movdqa %xmm2, %xmm4 637 638#ifndef USE_SSSE3 639 psrldq $3, %xmm3 640 pslldq $13, %xmm2 641 por %xmm3, %xmm2 /* merge into one 16byte value */ 642#else 643 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ 644#endif 645 TOLOWER (%xmm1, %xmm2) 646 647 pcmpeqb %xmm1, %xmm0 648 pcmpeqb %xmm2, %xmm1 649 psubb %xmm0, %xmm1 650 pmovmskb %xmm1, %edx 651 sub $0xffff, %edx 652 jnz LABEL(exit) 653 654#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 655 sub $16, %r11 656 jbe LABEL(strcmp_exitz) 657#endif 658 659 add $16, %rcx 660 movdqa %xmm4, %xmm3 661 jmp LABEL(loop_ashr_3) 662 663 .p2align 4 664LABEL(nibble_ashr_3): 665 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 666 pmovmskb %xmm0, %edx 667 test $0xfff8, %edx 668 jnz LABEL(ashr_3_exittail) 669 670#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 671 cmp $13, %r11 672 jbe LABEL(ashr_3_exittail) 673#endif 674 675 pxor %xmm0, %xmm0 676 sub $0x1000, %r10 677 jmp LABEL(gobble_ashr_3) 678 679 
.p2align 4 680LABEL(ashr_3_exittail): 681 movdqa (%rsi, %rcx), %xmm1 682 psrldq $3, %xmm0 683 psrldq $3, %xmm3 684 jmp LABEL(aftertail) 685 686/* 687 * The following cases will be handled by ashr_4 688 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 689 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 690 */ 691 .p2align 4 692LABEL(ashr_4): 693 pxor %xmm0, %xmm0 694 movdqa (%rdi), %xmm2 695 movdqa (%rsi), %xmm1 696 pcmpeqb %xmm1, %xmm0 697 pslldq $12, %xmm2 698 TOLOWER (%xmm1, %xmm2) 699 pcmpeqb %xmm1, %xmm2 700 psubb %xmm0, %xmm2 701 pmovmskb %xmm2, %r9d 702 shr %cl, %edx 703 shr %cl, %r9d 704 sub %r9d, %edx 705 jnz LABEL(less32bytes) 706 movdqa (%rdi), %xmm3 707 708 UPDATE_STRNCMP_COUNTER 709 710 pxor %xmm0, %xmm0 711 mov $16, %rcx /* index for loads */ 712 mov $4, %r9d /* byte position left over from less32bytes case */ 713 /* 714 * Setup %r10 value allows us to detect crossing a page boundary. 715 * When %r10 goes positive we have crossed a page boundary and 716 * need to do a nibble. 
717 */ 718 lea 4(%rdi), %r10 719 and $0xfff, %r10 /* offset into 4K page */ 720 sub $0x1000, %r10 /* subtract 4K pagesize */ 721 722 .p2align 4 723LABEL(loop_ashr_4): 724 add $16, %r10 725 jg LABEL(nibble_ashr_4) 726 727LABEL(gobble_ashr_4): 728 movdqa (%rsi, %rcx), %xmm1 729 movdqa (%rdi, %rcx), %xmm2 730 movdqa %xmm2, %xmm4 731 732#ifndef USE_SSSE3 733 psrldq $4, %xmm3 734 pslldq $12, %xmm2 735 por %xmm3, %xmm2 /* merge into one 16byte value */ 736#else 737 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ 738#endif 739 TOLOWER (%xmm1, %xmm2) 740 741 pcmpeqb %xmm1, %xmm0 742 pcmpeqb %xmm2, %xmm1 743 psubb %xmm0, %xmm1 744 pmovmskb %xmm1, %edx 745 sub $0xffff, %edx 746 jnz LABEL(exit) 747 748#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 749 sub $16, %r11 750 jbe LABEL(strcmp_exitz) 751#endif 752 753 add $16, %rcx 754 movdqa %xmm4, %xmm3 755 756 add $16, %r10 757 jg LABEL(nibble_ashr_4) /* cross page boundary */ 758 759 movdqa (%rsi, %rcx), %xmm1 760 movdqa (%rdi, %rcx), %xmm2 761 movdqa %xmm2, %xmm4 762 763#ifndef USE_SSSE3 764 psrldq $4, %xmm3 765 pslldq $12, %xmm2 766 por %xmm3, %xmm2 /* merge into one 16byte value */ 767#else 768 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ 769#endif 770 TOLOWER (%xmm1, %xmm2) 771 772 pcmpeqb %xmm1, %xmm0 773 pcmpeqb %xmm2, %xmm1 774 psubb %xmm0, %xmm1 775 pmovmskb %xmm1, %edx 776 sub $0xffff, %edx 777 jnz LABEL(exit) 778 779#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 780 sub $16, %r11 781 jbe LABEL(strcmp_exitz) 782#endif 783 784 add $16, %rcx 785 movdqa %xmm4, %xmm3 786 jmp LABEL(loop_ashr_4) 787 788 .p2align 4 789LABEL(nibble_ashr_4): 790 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 791 pmovmskb %xmm0, %edx 792 test $0xfff0, %edx 793 jnz LABEL(ashr_4_exittail) 794 795#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 796 cmp $12, %r11 797 jbe LABEL(ashr_4_exittail) 798#endif 799 800 pxor %xmm0, %xmm0 801 sub $0x1000, %r10 802 jmp LABEL(gobble_ashr_4) 803 804 
.p2align 4 805LABEL(ashr_4_exittail): 806 movdqa (%rsi, %rcx), %xmm1 807 psrldq $4, %xmm0 808 psrldq $4, %xmm3 809 jmp LABEL(aftertail) 810 811/* 812 * The following cases will be handled by ashr_5 813 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 814 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 815 */ 816 .p2align 4 817LABEL(ashr_5): 818 pxor %xmm0, %xmm0 819 movdqa (%rdi), %xmm2 820 movdqa (%rsi), %xmm1 821 pcmpeqb %xmm1, %xmm0 822 pslldq $11, %xmm2 823 TOLOWER (%xmm1, %xmm2) 824 pcmpeqb %xmm1, %xmm2 825 psubb %xmm0, %xmm2 826 pmovmskb %xmm2, %r9d 827 shr %cl, %edx 828 shr %cl, %r9d 829 sub %r9d, %edx 830 jnz LABEL(less32bytes) 831 movdqa (%rdi), %xmm3 832 833 UPDATE_STRNCMP_COUNTER 834 835 pxor %xmm0, %xmm0 836 mov $16, %rcx /* index for loads */ 837 mov $5, %r9d /* byte position left over from less32bytes case */ 838 /* 839 * Setup %r10 value allows us to detect crossing a page boundary. 840 * When %r10 goes positive we have crossed a page boundary and 841 * need to do a nibble. 
842 */ 843 lea 5(%rdi), %r10 844 and $0xfff, %r10 /* offset into 4K page */ 845 sub $0x1000, %r10 /* subtract 4K pagesize */ 846 847 .p2align 4 848LABEL(loop_ashr_5): 849 add $16, %r10 850 jg LABEL(nibble_ashr_5) 851 852LABEL(gobble_ashr_5): 853 movdqa (%rsi, %rcx), %xmm1 854 movdqa (%rdi, %rcx), %xmm2 855 movdqa %xmm2, %xmm4 856 857#ifndef USE_SSSE3 858 psrldq $5, %xmm3 859 pslldq $11, %xmm2 860 por %xmm3, %xmm2 /* merge into one 16byte value */ 861#else 862 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ 863#endif 864 TOLOWER (%xmm1, %xmm2) 865 866 pcmpeqb %xmm1, %xmm0 867 pcmpeqb %xmm2, %xmm1 868 psubb %xmm0, %xmm1 869 pmovmskb %xmm1, %edx 870 sub $0xffff, %edx 871 jnz LABEL(exit) 872 873#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 874 sub $16, %r11 875 jbe LABEL(strcmp_exitz) 876#endif 877 878 add $16, %rcx 879 movdqa %xmm4, %xmm3 880 881 add $16, %r10 882 jg LABEL(nibble_ashr_5) /* cross page boundary */ 883 884 movdqa (%rsi, %rcx), %xmm1 885 movdqa (%rdi, %rcx), %xmm2 886 movdqa %xmm2, %xmm4 887 888#ifndef USE_SSSE3 889 psrldq $5, %xmm3 890 pslldq $11, %xmm2 891 por %xmm3, %xmm2 /* merge into one 16byte value */ 892#else 893 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ 894#endif 895 TOLOWER (%xmm1, %xmm2) 896 897 pcmpeqb %xmm1, %xmm0 898 pcmpeqb %xmm2, %xmm1 899 psubb %xmm0, %xmm1 900 pmovmskb %xmm1, %edx 901 sub $0xffff, %edx 902 jnz LABEL(exit) 903 904#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 905 sub $16, %r11 906 jbe LABEL(strcmp_exitz) 907#endif 908 909 add $16, %rcx 910 movdqa %xmm4, %xmm3 911 jmp LABEL(loop_ashr_5) 912 913 .p2align 4 914LABEL(nibble_ashr_5): 915 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 916 pmovmskb %xmm0, %edx 917 test $0xffe0, %edx 918 jnz LABEL(ashr_5_exittail) 919 920#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 921 cmp $11, %r11 922 jbe LABEL(ashr_5_exittail) 923#endif 924 925 pxor %xmm0, %xmm0 926 sub $0x1000, %r10 927 jmp LABEL(gobble_ashr_5) 928 929 
.p2align 4 930LABEL(ashr_5_exittail): 931 movdqa (%rsi, %rcx), %xmm1 932 psrldq $5, %xmm0 933 psrldq $5, %xmm3 934 jmp LABEL(aftertail) 935 936/* 937 * The following cases will be handled by ashr_6 938 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 939 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 940 */ 941 .p2align 4 942LABEL(ashr_6): 943 pxor %xmm0, %xmm0 944 movdqa (%rdi), %xmm2 945 movdqa (%rsi), %xmm1 946 pcmpeqb %xmm1, %xmm0 947 pslldq $10, %xmm2 948 TOLOWER (%xmm1, %xmm2) 949 pcmpeqb %xmm1, %xmm2 950 psubb %xmm0, %xmm2 951 pmovmskb %xmm2, %r9d 952 shr %cl, %edx 953 shr %cl, %r9d 954 sub %r9d, %edx 955 jnz LABEL(less32bytes) 956 movdqa (%rdi), %xmm3 957 958 UPDATE_STRNCMP_COUNTER 959 960 pxor %xmm0, %xmm0 961 mov $16, %rcx /* index for loads */ 962 mov $6, %r9d /* byte position left over from less32bytes case */ 963 /* 964 * Setup %r10 value allows us to detect crossing a page boundary. 965 * When %r10 goes positive we have crossed a page boundary and 966 * need to do a nibble. 
967 */ 968 lea 6(%rdi), %r10 969 and $0xfff, %r10 /* offset into 4K page */ 970 sub $0x1000, %r10 /* subtract 4K pagesize */ 971 972 .p2align 4 973LABEL(loop_ashr_6): 974 add $16, %r10 975 jg LABEL(nibble_ashr_6) 976 977LABEL(gobble_ashr_6): 978 movdqa (%rsi, %rcx), %xmm1 979 movdqa (%rdi, %rcx), %xmm2 980 movdqa %xmm2, %xmm4 981 982#ifndef USE_SSSE3 983 psrldq $6, %xmm3 984 pslldq $10, %xmm2 985 por %xmm3, %xmm2 /* merge into one 16byte value */ 986#else 987 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ 988#endif 989 TOLOWER (%xmm1, %xmm2) 990 991 pcmpeqb %xmm1, %xmm0 992 pcmpeqb %xmm2, %xmm1 993 psubb %xmm0, %xmm1 994 pmovmskb %xmm1, %edx 995 sub $0xffff, %edx 996 jnz LABEL(exit) 997 998#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 999 sub $16, %r11 1000 jbe LABEL(strcmp_exitz) 1001#endif 1002 1003 add $16, %rcx 1004 movdqa %xmm4, %xmm3 1005 1006 add $16, %r10 1007 jg LABEL(nibble_ashr_6) /* cross page boundary */ 1008 1009 movdqa (%rsi, %rcx), %xmm1 1010 movdqa (%rdi, %rcx), %xmm2 1011 movdqa %xmm2, %xmm4 1012 1013#ifndef USE_SSSE3 1014 psrldq $6, %xmm3 1015 pslldq $10, %xmm2 1016 por %xmm3, %xmm2 /* merge into one 16byte value */ 1017#else 1018 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ 1019#endif 1020 TOLOWER (%xmm1, %xmm2) 1021 1022 pcmpeqb %xmm1, %xmm0 1023 pcmpeqb %xmm2, %xmm1 1024 psubb %xmm0, %xmm1 1025 pmovmskb %xmm1, %edx 1026 sub $0xffff, %edx 1027 jnz LABEL(exit) 1028 1029#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 1030 sub $16, %r11 1031 jbe LABEL(strcmp_exitz) 1032#endif 1033 1034 add $16, %rcx 1035 movdqa %xmm4, %xmm3 1036 jmp LABEL(loop_ashr_6) 1037 1038 .p2align 4 1039LABEL(nibble_ashr_6): 1040 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1041 pmovmskb %xmm0, %edx 1042 test $0xffc0, %edx 1043 jnz LABEL(ashr_6_exittail) 1044 1045#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 1046 cmp $10, %r11 1047 jbe LABEL(ashr_6_exittail) 1048#endif 1049 1050 pxor %xmm0, %xmm0 1051 
sub $0x1000, %r10 1052 jmp LABEL(gobble_ashr_6) 1053 1054 .p2align 4 1055LABEL(ashr_6_exittail): 1056 movdqa (%rsi, %rcx), %xmm1 1057 psrldq $6, %xmm0 1058 psrldq $6, %xmm3 1059 jmp LABEL(aftertail) 1060 1061/* 1062 * The following cases will be handled by ashr_7 1063 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1064 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 1065 */ 1066 .p2align 4 1067LABEL(ashr_7): 1068 pxor %xmm0, %xmm0 1069 movdqa (%rdi), %xmm2 1070 movdqa (%rsi), %xmm1 1071 pcmpeqb %xmm1, %xmm0 1072 pslldq $9, %xmm2 1073 TOLOWER (%xmm1, %xmm2) 1074 pcmpeqb %xmm1, %xmm2 1075 psubb %xmm0, %xmm2 1076 pmovmskb %xmm2, %r9d 1077 shr %cl, %edx 1078 shr %cl, %r9d 1079 sub %r9d, %edx 1080 jnz LABEL(less32bytes) 1081 movdqa (%rdi), %xmm3 1082 1083 UPDATE_STRNCMP_COUNTER 1084 1085 pxor %xmm0, %xmm0 1086 mov $16, %rcx /* index for loads */ 1087 mov $7, %r9d /* byte position left over from less32bytes case */ 1088 /* 1089 * Setup %r10 value allows us to detect crossing a page boundary. 1090 * When %r10 goes positive we have crossed a page boundary and 1091 * need to do a nibble. 
*/
	lea	7(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

/*
 * Main loop of the ashr_7 case.  The loop_ashr_8 .. loop_ashr_15 loops
 * further down have exactly the same shape and differ only in the
 * shift count N (7 here).
 *
 * Each pass (the body is unrolled twice):
 *   - loads the next 16 bytes of (%rsi, %rcx) into %xmm1 and of
 *     (%rdi, %rcx) into %xmm2; %xmm4 keeps a copy of the latter, which
 *     becomes %xmm3 (the previous chunk) for the following pass;
 *   - rebuilds %xmm2 from bytes N..15 of %xmm3 followed by bytes
 *     0..N-1 of the new chunk (psrldq/pslldq/por, or palignr when
 *     USE_SSSE3 is defined);
 *   - turns %xmm0, which is all-zero on entry, into the NUL mask of
 *     %xmm1.  After psubb/pmovmskb the 16-bit mask is 0xffff only if
 *     all 16 bytes are equal and none is NUL, so the sub of 0xffff
 *     leaves %edx zero, and %xmm0 all-zero again, exactly in that case;
 *   - in the strncmp/strncasecmp builds, takes 16 off the remaining
 *     count in %r11 and finishes with an equal result once it runs out.
 *
 * %r10 is advanced by 16 before every load; when it goes positive the
 * nibble_ashr_N code runs first.
 */
	.p2align 4
LABEL(loop_ashr_7):
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)

LABEL(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the new chunk for the next pass */

#ifndef USE_SSSE3
	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 = NUL bytes of %xmm1 */
	pcmpeqb	%xmm2, %xmm1		/* %xmm1 = bytes that are equal */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all equal and no NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_7)

/*
 * Before another chunk is loaded, look for a NUL among bytes 7..15 of
 * %xmm3 (mask 0xff80), i.e. the bytes the next merge would consume.
 * If one is found, or if no more than 9 bytes remain to be compared,
 * finish from %xmm3 alone via ashr_7_exittail; presumably this avoids
 * reading (%rdi) beyond the page boundary when the string ends first.
 * Otherwise clear %xmm0, rewind %r10 by one page and resume the loop.
 */
	.p2align 4
LABEL(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx
	jnz	LABEL(ashr_7_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$9, %r11
	jbe	LABEL(ashr_7_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_7)

/*
 * Tail: shift the NUL mask in %xmm0 and the leftover bytes in %xmm3
 * down to byte 0 so that aftertail can compare them with %xmm1.
 */
	.p2align 4
LABEL(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0
	psrldq	$7, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(8~15)            n - 8                7(15 +(n - 8) - n)         ashr_8
 *
 * Entry sequence shared by every ashr_N below: compare the first
 * aligned chunk of (%rsi) with the first chunk of (%rdi) shifted left
 * by 16 - N bytes, and leave through less32bytes if the result already
 * decides the comparison.
 * NOTE(review): %edx, %cl, %rax and %r8d are set up by the dispatch
 * code that precedes this section; confirm their meaning there.
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* %xmm0 = NUL bytes of %xmm1 */
	pslldq	$8, %xmm2		/* shift left by 16 - 8 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* drop the low %cl bits of both masks */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* previous (%rdi) chunk for the loop */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 8 bytes.  */
	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_8)

	.p2align 4
LABEL(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx		/* NUL among bytes 8..15 of %xmm3? */
	jnz	LABEL(ashr_8_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$8, %r11		/* at most 8 bytes left to compare */
	jbe	LABEL(ashr_8_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_8)

	.p2align 4
LABEL(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0
	psrldq	$8, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(7~15)            n - 7                8(15 +(n - 7) - n)         ashr_9
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$7, %xmm2		/* shift left by 16 - 9 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 9 bytes.  */
	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	.p2align 4
LABEL(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx		/* NUL among bytes 9..15 of %xmm3? */
	jnz	LABEL(ashr_9_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$7, %r11		/* at most 7 bytes left to compare */
	jbe	LABEL(ashr_9_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_9)

	.p2align 4
LABEL(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0
	psrldq	$9, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(6~15)            n - 6                9(15 +(n - 6) - n)         ashr_10
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$6, %xmm2		/* shift left by 16 - 10 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 10 bytes.  */
	.p2align 4
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_10)

	.p2align 4
LABEL(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx		/* NUL among bytes 10..15 of %xmm3? */
	jnz	LABEL(ashr_10_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$6, %r11		/* at most 6 bytes left to compare */
	jbe	LABEL(ashr_10_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_10)

	.p2align 4
LABEL(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0
	psrldq	$10, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(5~15)            n - 5                10(15 +(n - 5) - n)        ashr_11
 */
	.p2align 4
LABEL(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$5, %xmm2		/* shift left by 16 - 11 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 11 bytes.  */
	.p2align 4
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_11)

	.p2align 4
LABEL(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* NUL among bytes 11..15 of %xmm3? */
	jnz	LABEL(ashr_11_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$5, %r11		/* at most 5 bytes left to compare */
	jbe	LABEL(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_11)

	.p2align 4
LABEL(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0
	psrldq	$11, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(4~15)            n - 4                11(15 +(n - 4) - n)        ashr_12
 */
	.p2align 4
LABEL(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$4, %xmm2		/* shift left by 16 - 12 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 12 bytes.  */
	.p2align 4
LABEL(loop_ashr_12):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)

LABEL(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	.p2align 4
LABEL(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* NUL among bytes 12..15 of %xmm3? */
	jnz	LABEL(ashr_12_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$4, %r11		/* at most 4 bytes left to compare */
	jbe	LABEL(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_12)

	.p2align 4
LABEL(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0
	psrldq	$12, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(3~15)            n - 3                12(15 +(n - 3) - n)        ashr_13
 */
	.p2align 4
LABEL(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$3, %xmm2		/* shift left by 16 - 13 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 13 bytes.  */
	.p2align 4
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	.p2align 4
LABEL(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* NUL among bytes 13..15 of %xmm3? */
	jnz	LABEL(ashr_13_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$3, %r11		/* at most 3 bytes left to compare */
	jbe	LABEL(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_13)

	.p2align 4
LABEL(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0
	psrldq	$13, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(2~15)            n - 2                13(15 +(n - 2) - n)        ashr_14
 */
	.p2align 4
LABEL(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$2, %xmm2		/* shift left by 16 - 14 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 14 bytes.  */
	.p2align 4
LABEL(loop_ashr_14):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)

LABEL(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

	/* Logical OR, spelled like every other guard in this file.  */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_14)

	.p2align 4
LABEL(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx		/* NUL among bytes 14..15 of %xmm3? */
	jnz	LABEL(ashr_14_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$2, %r11		/* at most 2 bytes left to compare */
	jbe	LABEL(ashr_14_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_14)

	.p2align 4
LABEL(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0
	psrldq	$14, %xmm3
	jmp	LABEL(aftertail)

/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
 *        n(1~15)            n - 1                14(15 +(n - 1) - n)        ashr_15
 */
	.p2align 4
LABEL(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$1, %xmm2		/* shift left by 16 - 15 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Same loop shape as loop_ashr_7, with a shift of 15 bytes.  */
	.p2align 4
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	.p2align 4
LABEL(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx		/* NUL in byte 15 of %xmm3? */
	jnz	LABEL(ashr_15_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmpq	$1, %r11		/* at most 1 byte left to compare */
	jbe	LABEL(ashr_15_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	LABEL(gobble_ashr_15)

	/* Unlike the other tails, this one falls through into aftertail.  */
	.p2align 4
LABEL(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3
	psrldq	$15, %xmm0

/*
 * Common tail of all ashr_N_exittail paths: compare the leftover bytes
 * in %xmm3 with %xmm1.  After the final not, a set bit in %edx marks a
 * byte position that differs or holds a NUL in %xmm3.
 */
	.p2align 4
LABEL(aftertail):
	TOLOWER (%xmm1, %xmm3)
	pcmpeqb	%xmm3, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	not	%edx

	.p2align 4
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

/*
 * Return path: %rdx holds a mask whose lowest set bit is the index of
 * the deciding byte.  The result is the difference of the two bytes at
 * that index, after mapping through the tolower table in the
 * case-insensitive builds.
 */
	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11		/* deciding byte at or past the count limit */
	jbe	LABEL(strcmp_exitz)
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Look both bytes up in a table of 4-byte entries.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret

	/* Strings compare equal: return 0.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret

	/* Return the difference of the bytes at (%rdi) and (%rsi).  */
	.p2align 4
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret
END (STRCMP)

/*
 * Table of 32-bit offsets of the ashr_N handlers, each relative to the
 * start of the table.  Entries run ashr_1 .. ashr_15 with ashr_0 last;
 * the code that indexes the table precedes this section.
 */
	.section .rodata,"a",@progbits
	.p2align 3
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
libc_hidden_builtin_def (STRCMP)