1/* strcmp with SSE4.2 2 Copyright (C) 2009-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21#ifndef STRCMP_SSE42 22# define STRCMP_SSE42 __strcmp_sse42 23#endif 24 25#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 26# include "locale-defines.h" 27#endif 28 29#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 30/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz 31 if the new counter > the old one (the subtraction wrapped around) or is 0. Otherwise the new counter is written back to %r11. */ 32# define UPDATE_STRNCMP_COUNTER \ 33 /* %r9 = %r11 - (16 - %rcx): number of bytes left to compare */ \ 34 lea -16(%rcx, %r11), %r9; \ 35 cmp %r9, %r11; \ 36 jb LABEL(strcmp_exitz); \ 37 test %r9, %r9; \ 38 je LABEL(strcmp_exitz); \ 39 mov %r9, %r11 40#else 41# define UPDATE_STRNCMP_COUNTER 42#endif 43 /* SECTION names the .text subsection used below and GLABEL(l) appends the matching _avx or _sse42 suffix to a global label. */ 44#ifdef USE_AVX 45# define SECTION avx 46# define GLABEL(l) l##_avx 47#else 48# define SECTION sse4.2 49# define GLABEL(l) l##_sse42 50#endif 51 /* LABEL(l) prefixes l with .L to form a local (non-exported) label. */ 52#define LABEL(l) .L##l 53 54/* We use 0x1a: 55 _SIDD_SBYTE_OPS 56 | _SIDD_CMP_EQUAL_EACH 57 | _SIDD_NEGATIVE_POLARITY 58 | _SIDD_LEAST_SIGNIFICANT 59 on pcmpistri to find out if two 16byte data elements are the same 60 and the offset of the first different byte. There are 4 cases: 61 62 1. Both 16byte data elements are valid and identical. 63 2. 
Both 16byte data elements have EOS and identical. 64 3. Both 16byte data elements are valid and they differ at offset X. 65 4. At least one 16byte data element has EOS at offset X. Two 16byte 66 data elements must differ at or before offset X. 67 68 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: 69 70 case ECX CFlag ZFlag SFlag 71 1 16 0 0 0 72 2 16 0 1 1 73 3 X 1 0 0 74 4 0 <= X 1 0/1 0/1 75 76 We exit from the loop for cases 2, 3 and 4 with jbe which branches 77 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for 78 case 2. */ 79 80 /* Put all SSE 4.2 functions together. */ 81 .section .text.SECTION,"ax",@progbits 82 .align 16 83 .type STRCMP_SSE42, @function 84 .globl STRCMP_SSE42 85 .hidden STRCMP_SSE42 86#ifdef USE_AS_STRCASECMP_L /* Entry point without a locale argument: load the value of the TLS variable __libc_tsd_LOCALE into %RDX_LP (third argument register), then run into the code below. */ 87ENTRY (GLABEL(__strcasecmp)) 88 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax 89 mov %fs:(%rax),%RDX_LP 90 91 // XXX 5 byte should be before the function 92 /* 5-byte NOP, emitted as raw bytes. */ 93 .byte 0x0f,0x1f,0x44,0x00,0x00 94END (GLABEL(__strcasecmp)) 95 /* FALLTHROUGH to strcasecmp_l, with the locale pointer in %RDX_LP. */ 96#endif 97#ifdef USE_AS_STRNCASECMP_L /* Entry point without a locale argument: load the value of the TLS variable __libc_tsd_LOCALE into %RCX_LP (fourth argument register), then run into the code below. */ 98ENTRY (GLABEL(__strncasecmp)) 99 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax 100 mov %fs:(%rax),%RCX_LP 101 102 // XXX 5 byte should be before the function 103 /* 5-byte NOP, emitted as raw bytes. */ 104 .byte 0x0f,0x1f,0x44,0x00,0x00 105END (GLABEL(__strncasecmp)) 106 /* FALLTHROUGH to strncasecmp_l, with the locale pointer in %RCX_LP. */ 107#endif 108 109 /* With USE_AVX the legacy SSE mnemonics used in this file are remapped to their v-prefixed forms, and D(arg) repeats the destination operand so that a two-operand instruction is written in three-operand form. Without USE_AVX, D(arg) is just arg. */ 110#ifdef USE_AVX 111# define movdqa vmovdqa 112# define movdqu vmovdqu 113# define pmovmskb vpmovmskb 114# define pcmpistri vpcmpistri 115# define psubb vpsubb 116# define pcmpeqb vpcmpeqb 117# define psrldq vpsrldq 118# define pslldq vpslldq 119# define palignr vpalignr 120# define pxor vpxor 121# define D(arg) arg, arg 122#else 123# define D(arg) arg 124#endif 125 126STRCMP_SSE42: 127 cfi_startproc 128 _CET_ENDBR 129 CALL_MCOUNT 130 131/* 132 * This implementation uses SSE to compare up to 16 bytes at a time. 
133 */ 134#ifdef USE_AS_STRCASECMP_L 135 /* We have to fall back on the C implementation for locales 136 with encodings not matching ASCII for single bytes. */ 137# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 138 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP 139# else 140 mov (%rdx), %RAX_LP 141# endif 142 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 143 jne __strcasecmp_l_nonascii 144#endif 145#ifdef USE_AS_STRNCASECMP_L 146 /* We have to fall back on the C implementation for locales 147 with encodings not matching ASCII for single bytes. */ 148# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 149 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP 150# else 151 mov (%rcx), %RAX_LP 152# endif 153 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) 154 jne __strncasecmp_l_nonascii 155#endif 156 157#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 158 test %RDX_LP, %RDX_LP 159 je LABEL(strcmp_exitz) 160 cmp $1, %RDX_LP 161 je LABEL(Byte0) 162 mov %RDX_LP, %R11_LP 163#endif 164 mov %esi, %ecx 165 mov %edi, %eax 166/* Use 64bit AND here to avoid long NOP padding. 
*/ 167 and $0x3f, %rcx /* rsi alignment in cache line */ 168 and $0x3f, %rax /* rdi alignment in cache line */ 169#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 170 .section .rodata.cst16,"aM",@progbits,16 171 .align 16 172LABEL(belowupper): 173 .quad 0x4040404040404040 174 .quad 0x4040404040404040 175LABEL(topupper): 176# ifdef USE_AVX 177 .quad 0x5a5a5a5a5a5a5a5a 178 .quad 0x5a5a5a5a5a5a5a5a 179# else 180 .quad 0x5b5b5b5b5b5b5b5b 181 .quad 0x5b5b5b5b5b5b5b5b 182# endif 183LABEL(touppermask): 184 .quad 0x2020202020202020 185 .quad 0x2020202020202020 186 .previous 187 movdqa LABEL(belowupper)(%rip), %xmm4 188# define UCLOW_reg %xmm4 189 movdqa LABEL(topupper)(%rip), %xmm5 190# define UCHIGH_reg %xmm5 191 movdqa LABEL(touppermask)(%rip), %xmm6 192# define LCQWORD_reg %xmm6 193#endif 194 cmp $0x30, %ecx 195 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ 196 cmp $0x30, %eax 197 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ 198 movdqu (%rdi), %xmm1 199 movdqu (%rsi), %xmm2 200#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 201# ifdef USE_AVX 202# define TOLOWER(reg1, reg2) \ 203 vpcmpgtb UCLOW_reg, reg1, %xmm7; \ 204 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ 205 vpcmpgtb UCLOW_reg, reg2, %xmm9; \ 206 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ 207 vpandn %xmm7, %xmm8, %xmm8; \ 208 vpandn %xmm9, %xmm10, %xmm10; \ 209 vpand LCQWORD_reg, %xmm8, %xmm8; \ 210 vpand LCQWORD_reg, %xmm10, %xmm10; \ 211 vpor reg1, %xmm8, reg1; \ 212 vpor reg2, %xmm10, reg2 213# else 214# define TOLOWER(reg1, reg2) \ 215 movdqa reg1, %xmm7; \ 216 movdqa UCHIGH_reg, %xmm8; \ 217 movdqa reg2, %xmm9; \ 218 movdqa UCHIGH_reg, %xmm10; \ 219 pcmpgtb UCLOW_reg, %xmm7; \ 220 pcmpgtb reg1, %xmm8; \ 221 pcmpgtb UCLOW_reg, %xmm9; \ 222 pcmpgtb reg2, %xmm10; \ 223 pand %xmm8, %xmm7; \ 224 pand %xmm10, %xmm9; \ 225 pand LCQWORD_reg, %xmm7; \ 226 pand LCQWORD_reg, %xmm9; \ 227 por %xmm7, reg1; \ 228 por %xmm9, reg2 229# endif 230 TOLOWER (%xmm1, 
%xmm2) 231#else 232# define TOLOWER(reg1, reg2) 233#endif 234 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ 235 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ 236 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ 237 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ 238 pmovmskb %xmm1, %edx 239 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 240 jnz LABEL(less16bytes)/* If not, find different value or null char */ 241#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 242 sub $16, %r11 243 jbe LABEL(strcmp_exitz)/* finish comparison */ 244#endif 245 add $16, %rsi /* prepare to search next 16 bytes */ 246 add $16, %rdi /* prepare to search next 16 bytes */ 247 248 /* 249 * Determine source and destination string offsets from 16-byte 250 * alignment. Use relative offset difference between the two to 251 * determine which case below to use. 252 */ 253 .p2align 4 254LABEL(crosscache): 255 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ 256 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ 257 mov $0xffff, %edx /* for equivalent offset */ 258 xor %r8d, %r8d 259 and $0xf, %ecx /* offset of rsi */ 260 and $0xf, %eax /* offset of rdi */ 261 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ 262 cmp %eax, %ecx 263 je LABEL(ashr_0) /* rsi and rdi relative offset same */ 264 ja LABEL(bigger) 265 mov %edx, %r8d /* r8d is offset flag for exit tail */ 266 xchg %ecx, %eax 267 xchg %rsi, %rdi 268LABEL(bigger): 269 movdqa (%rdi), %xmm2 270 movdqa (%rsi), %xmm1 271 lea 15(%rax), %r9 272 sub %rcx, %r9 273 lea LABEL(unaligned_table)(%rip), %r10 274 movslq (%r10, %r9,4), %r9 275 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
*/ 276 lea (%r10, %r9), %r10 277 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ 278 279/* 280 * The following cases will be handled by ashr_0 281 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 282 * n(0~15) n(0~15) 15(15+ n-n) ashr_0 283 */ 284 .p2align 4 285LABEL(ashr_0): 286 287 movdqa (%rsi), %xmm1 288 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ 289#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 290 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ 291#else 292 movdqa (%rdi), %xmm2 293 TOLOWER (%xmm1, %xmm2) 294 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ 295#endif 296 psubb %xmm0, D(%xmm1) /* packed sub of comparison results: a byte keeps its top bit only where the two bytes match and the (%rsi) byte is not null */ 297 pmovmskb %xmm1, %r9d 298 shr %cl, %edx /* adjust 0xffff for offset */ 299 shr %cl, %r9d /* adjust for 16-byte offset */ 300 sub %r9d, %edx 301 /* 302 * edx is now zero only if r9d equals the shifted 0xffff mask, i.e. the 303 * remaining (16 - rcx) bytes of both blocks match and no null char was seen. 304 */ 305 jne LABEL(less32bytes) /* mismatch or null char */ 306 UPDATE_STRNCMP_COUNTER 307 mov $16, %rcx 308 mov $16, %r9 309 310 /* 311 * Now both strings are aligned at 16-byte boundary. Loop over strings 312 * checking 32-bytes per iteration. 
313 */ 314 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 315 .p2align 4 316LABEL(ashr_0_use): 317 movdqa (%rdi,%rdx), %xmm0 318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 319 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 320#else 321 movdqa (%rsi,%rdx), %xmm1 322 TOLOWER (%xmm0, %xmm1) 323 pcmpistri $0x1a, %xmm1, %xmm0 324#endif 325 lea 16(%rdx), %rdx 326 jbe LABEL(ashr_0_exit_use) 327#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 328 sub $16, %r11 329 jbe LABEL(strcmp_exitz) 330#endif 331 332 movdqa (%rdi,%rdx), %xmm0 333#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 334 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 335#else 336 movdqa (%rsi,%rdx), %xmm1 337 TOLOWER (%xmm0, %xmm1) 338 pcmpistri $0x1a, %xmm1, %xmm0 339#endif 340 lea 16(%rdx), %rdx 341 jbe LABEL(ashr_0_exit_use) 342#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 343 sub $16, %r11 344 jbe LABEL(strcmp_exitz) 345#endif 346 jmp LABEL(ashr_0_use) 347 348 349 .p2align 4 350LABEL(ashr_0_exit_use): 351 jnc LABEL(strcmp_exitz) 352#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 353 sub %rcx, %r11 354 jbe LABEL(strcmp_exitz) 355#endif 356 lea -16(%rdx, %rcx), %rcx 357 movzbl (%rdi, %rcx), %eax 358 movzbl (%rsi, %rcx), %edx 359#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L 360 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx 361 movl (%rcx,%rax,4), %eax 362 movl (%rcx,%rdx,4), %edx 363#endif 364 sub %edx, %eax 365 ret 366 367 368 369/* 370 * The following cases will be handled by ashr_1 371 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 372 * n(15) n -15 0(15 +(n-15) - n) ashr_1 373 */ 374 .p2align 4 375LABEL(ashr_1): 376 pslldq $15, D(%xmm2) /* shift first string to align with second */ 377 TOLOWER (%xmm1, %xmm2) 378 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ 379 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ 380 pmovmskb %xmm2, %r9d 381 shr %cl, %edx /* adjust 0xffff for 
offset */ 382 shr %cl, %r9d /* adjust for 16-byte offset */ 383 sub %r9d, %edx 384 jnz LABEL(less32bytes) /* mismatch or null char seen */ 385 movdqa (%rdi), %xmm3 386 UPDATE_STRNCMP_COUNTER 387 388 mov $16, %rcx /* index for loads*/ 389 mov $1, %r9d /* byte position left over from less32bytes case */ 390 /* 391 * Setup %r10 value allows us to detect crossing a page boundary. 392 * When %r10 goes positive we have crossed a page boundary and 393 * need to do a nibble. 394 */ 395 lea 1(%rdi), %r10 396 and $0xfff, %r10 /* offset into 4K page */ 397 sub $0x1000, %r10 /* subtract 4K pagesize */ 398 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 399 400 .p2align 4 401LABEL(loop_ashr_1_use): 402 add $16, %r10 403 jg LABEL(nibble_ashr_1_use) 404 405LABEL(nibble_ashr_1_restart_use): 406 movdqa (%rdi, %rdx), %xmm0 407 palignr $1, -16(%rdi, %rdx), D(%xmm0) 408#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 409 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 410#else 411 movdqa (%rsi,%rdx), %xmm1 412 TOLOWER (%xmm0, %xmm1) 413 pcmpistri $0x1a, %xmm1, %xmm0 414#endif 415 jbe LABEL(exit_use) 416#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 417 sub $16, %r11 418 jbe LABEL(strcmp_exitz) 419#endif 420 421 add $16, %rdx 422 add $16, %r10 423 jg LABEL(nibble_ashr_1_use) 424 425 movdqa (%rdi, %rdx), %xmm0 426 palignr $1, -16(%rdi, %rdx), D(%xmm0) 427#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 428 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 429#else 430 movdqa (%rsi,%rdx), %xmm1 431 TOLOWER (%xmm0, %xmm1) 432 pcmpistri $0x1a, %xmm1, %xmm0 433#endif 434 jbe LABEL(exit_use) 435#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 436 sub $16, %r11 437 jbe LABEL(strcmp_exitz) 438#endif 439 add $16, %rdx 440 jmp LABEL(loop_ashr_1_use) 441 442 .p2align 4 443LABEL(nibble_ashr_1_use): 444 sub $0x1000, %r10 445 movdqa -16(%rdi, %rdx), %xmm0 446 psrldq $1, D(%xmm0) 447 pcmpistri $0x3a,%xmm0, %xmm0 448#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L 449 cmp %r11, %rcx 450 jae LABEL(nibble_ashr_exit_use) 451#endif 452 cmp $14, %ecx 453 ja LABEL(nibble_ashr_1_restart_use) 454 455 jmp LABEL(nibble_ashr_exit_use) 456 457/* 458 * The following cases will be handled by ashr_2 459 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 460 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 461 */ 462 .p2align 4 463LABEL(ashr_2): 464 pslldq $14, D(%xmm2) 465 TOLOWER (%xmm1, %xmm2) 466 pcmpeqb %xmm1, D(%xmm2) 467 psubb %xmm0, D(%xmm2) 468 pmovmskb %xmm2, %r9d 469 shr %cl, %edx 470 shr %cl, %r9d 471 sub %r9d, %edx 472 jnz LABEL(less32bytes) 473 movdqa (%rdi), %xmm3 474 UPDATE_STRNCMP_COUNTER 475 476 mov $16, %rcx /* index for loads */ 477 mov $2, %r9d /* byte position left over from less32bytes case */ 478 /* 479 * Setup %r10 value allows us to detect crossing a page boundary. 480 * When %r10 goes positive we have crossed a page boundary and 481 * need to do a nibble. 482 */ 483 lea 2(%rdi), %r10 484 and $0xfff, %r10 /* offset into 4K page */ 485 sub $0x1000, %r10 /* subtract 4K pagesize */ 486 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 487 488 .p2align 4 489LABEL(loop_ashr_2_use): 490 add $16, %r10 491 jg LABEL(nibble_ashr_2_use) 492 493LABEL(nibble_ashr_2_restart_use): 494 movdqa (%rdi, %rdx), %xmm0 495 palignr $2, -16(%rdi, %rdx), D(%xmm0) 496#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 497 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 498#else 499 movdqa (%rsi,%rdx), %xmm1 500 TOLOWER (%xmm0, %xmm1) 501 pcmpistri $0x1a, %xmm1, %xmm0 502#endif 503 jbe LABEL(exit_use) 504#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 505 sub $16, %r11 506 jbe LABEL(strcmp_exitz) 507#endif 508 509 add $16, %rdx 510 add $16, %r10 511 jg LABEL(nibble_ashr_2_use) 512 513 movdqa (%rdi, %rdx), %xmm0 514 palignr $2, -16(%rdi, %rdx), D(%xmm0) 515#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 516 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 517#else 518 movdqa 
(%rsi,%rdx), %xmm1 519 TOLOWER (%xmm0, %xmm1) 520 pcmpistri $0x1a, %xmm1, %xmm0 521#endif 522 jbe LABEL(exit_use) 523#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 524 sub $16, %r11 525 jbe LABEL(strcmp_exitz) 526#endif 527 add $16, %rdx 528 jmp LABEL(loop_ashr_2_use) 529 530 .p2align 4 531LABEL(nibble_ashr_2_use): 532 sub $0x1000, %r10 533 movdqa -16(%rdi, %rdx), %xmm0 534 psrldq $2, D(%xmm0) 535 pcmpistri $0x3a,%xmm0, %xmm0 536#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 537 cmp %r11, %rcx 538 jae LABEL(nibble_ashr_exit_use) 539#endif 540 cmp $13, %ecx 541 ja LABEL(nibble_ashr_2_restart_use) 542 543 jmp LABEL(nibble_ashr_exit_use) 544 545/* 546 * The following cases will be handled by ashr_3 547 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 548 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 549 */ 550 .p2align 4 551LABEL(ashr_3): 552 pslldq $13, D(%xmm2) 553 TOLOWER (%xmm1, %xmm2) 554 pcmpeqb %xmm1, D(%xmm2) 555 psubb %xmm0, D(%xmm2) 556 pmovmskb %xmm2, %r9d 557 shr %cl, %edx 558 shr %cl, %r9d 559 sub %r9d, %edx 560 jnz LABEL(less32bytes) 561 movdqa (%rdi), %xmm3 562 563 UPDATE_STRNCMP_COUNTER 564 565 mov $16, %rcx /* index for loads */ 566 mov $3, %r9d /* byte position left over from less32bytes case */ 567 /* 568 * Setup %r10 value allows us to detect crossing a page boundary. 569 * When %r10 goes positive we have crossed a page boundary and 570 * need to do a nibble. 
571 */ 572 lea 3(%rdi), %r10 573 and $0xfff, %r10 /* offset into 4K page */ 574 sub $0x1000, %r10 /* subtract 4K pagesize */ 575 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 576 /* NOTE(review): every other loop_ashr_N_use label visible in this file is preceded by .p2align 4, this one is not -- confirm whether the omission is intentional. */ 577LABEL(loop_ashr_3_use): 578 add $16, %r10 579 jg LABEL(nibble_ashr_3_use) 580 581LABEL(nibble_ashr_3_restart_use): 582 movdqa (%rdi, %rdx), %xmm0 583 palignr $3, -16(%rdi, %rdx), D(%xmm0) 584#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 585 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 586#else 587 movdqa (%rsi,%rdx), %xmm1 588 TOLOWER (%xmm0, %xmm1) 589 pcmpistri $0x1a, %xmm1, %xmm0 590#endif 591 jbe LABEL(exit_use) 592#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 593 sub $16, %r11 594 jbe LABEL(strcmp_exitz) 595#endif 596 597 add $16, %rdx 598 add $16, %r10 599 jg LABEL(nibble_ashr_3_use) 600 601 movdqa (%rdi, %rdx), %xmm0 602 palignr $3, -16(%rdi, %rdx), D(%xmm0) 603#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 604 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 605#else 606 movdqa (%rsi,%rdx), %xmm1 607 TOLOWER (%xmm0, %xmm1) 608 pcmpistri $0x1a, %xmm1, %xmm0 609#endif 610 jbe LABEL(exit_use) 611#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 612 sub $16, %r11 613 jbe LABEL(strcmp_exitz) 614#endif 615 add $16, %rdx 616 jmp LABEL(loop_ashr_3_use) 617 618 .p2align 4 619LABEL(nibble_ashr_3_use): 620 sub $0x1000, %r10 621 movdqa -16(%rdi, %rdx), %xmm0 622 psrldq $3, D(%xmm0) 623 pcmpistri $0x3a,%xmm0, %xmm0 /* self-compare: ecx = index of the first zero byte; psrldq shifted 3 zero bytes in at the top, so ecx > 12 means the 13 remaining real bytes hold no null */ 624#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 625 cmp %r11, %rcx 626 jae LABEL(nibble_ashr_exit_use) 627#endif 628 cmp $12, %ecx 629 ja LABEL(nibble_ashr_3_restart_use) 630 631 jmp LABEL(nibble_ashr_exit_use) 632 633/* 634 * The following cases will be handled by ashr_4 635 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 636 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 637 */ 638 .p2align 4 639LABEL(ashr_4): 640 pslldq $12, D(%xmm2) 641 TOLOWER (%xmm1, %xmm2) 642 pcmpeqb %xmm1, D(%xmm2) 643 
psubb %xmm0, D(%xmm2) 644 pmovmskb %xmm2, %r9d 645 shr %cl, %edx 646 shr %cl, %r9d 647 sub %r9d, %edx 648 jnz LABEL(less32bytes) 649 movdqa (%rdi), %xmm3 650 651 UPDATE_STRNCMP_COUNTER 652 653 mov $16, %rcx /* index for loads */ 654 mov $4, %r9d /* byte position left over from less32bytes case */ 655 /* 656 * Setup %r10 value allows us to detect crossing a page boundary. 657 * When %r10 goes positive we have crossed a page boundary and 658 * need to do a nibble. 659 */ 660 lea 4(%rdi), %r10 661 and $0xfff, %r10 /* offset into 4K page */ 662 sub $0x1000, %r10 /* subtract 4K pagesize */ 663 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 664 665 .p2align 4 666LABEL(loop_ashr_4_use): 667 add $16, %r10 668 jg LABEL(nibble_ashr_4_use) 669 670LABEL(nibble_ashr_4_restart_use): 671 movdqa (%rdi, %rdx), %xmm0 672 palignr $4, -16(%rdi, %rdx), D(%xmm0) 673#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 674 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 675#else 676 movdqa (%rsi,%rdx), %xmm1 677 TOLOWER (%xmm0, %xmm1) 678 pcmpistri $0x1a, %xmm1, %xmm0 679#endif 680 jbe LABEL(exit_use) 681#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 682 sub $16, %r11 683 jbe LABEL(strcmp_exitz) 684#endif 685 686 add $16, %rdx 687 add $16, %r10 688 jg LABEL(nibble_ashr_4_use) 689 690 movdqa (%rdi, %rdx), %xmm0 691 palignr $4, -16(%rdi, %rdx), D(%xmm0) 692#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 693 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 694#else 695 movdqa (%rsi,%rdx), %xmm1 696 TOLOWER (%xmm0, %xmm1) 697 pcmpistri $0x1a, %xmm1, %xmm0 698#endif 699 jbe LABEL(exit_use) 700#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 701 sub $16, %r11 702 jbe LABEL(strcmp_exitz) 703#endif 704 add $16, %rdx 705 jmp LABEL(loop_ashr_4_use) 706 707 .p2align 4 708LABEL(nibble_ashr_4_use): 709 sub $0x1000, %r10 710 movdqa -16(%rdi, %rdx), %xmm0 711 psrldq $4, D(%xmm0) 712 pcmpistri $0x3a,%xmm0, %xmm0 713#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L 714 cmp %r11, %rcx 715 jae LABEL(nibble_ashr_exit_use) 716#endif 717 cmp $11, %ecx 718 ja LABEL(nibble_ashr_4_restart_use) 719 720 jmp LABEL(nibble_ashr_exit_use) 721 722/* 723 * The following cases will be handled by ashr_5 724 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 725 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 726 */ 727 .p2align 4 728LABEL(ashr_5): 729 pslldq $11, D(%xmm2) 730 TOLOWER (%xmm1, %xmm2) 731 pcmpeqb %xmm1, D(%xmm2) 732 psubb %xmm0, D(%xmm2) 733 pmovmskb %xmm2, %r9d 734 shr %cl, %edx 735 shr %cl, %r9d 736 sub %r9d, %edx 737 jnz LABEL(less32bytes) 738 movdqa (%rdi), %xmm3 739 740 UPDATE_STRNCMP_COUNTER 741 742 mov $16, %rcx /* index for loads */ 743 mov $5, %r9d /* byte position left over from less32bytes case */ 744 /* 745 * Setup %r10 value allows us to detect crossing a page boundary. 746 * When %r10 goes positive we have crossed a page boundary and 747 * need to do a nibble. 748 */ 749 lea 5(%rdi), %r10 750 and $0xfff, %r10 /* offset into 4K page */ 751 sub $0x1000, %r10 /* subtract 4K pagesize */ 752 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 753 754 .p2align 4 755LABEL(loop_ashr_5_use): 756 add $16, %r10 757 jg LABEL(nibble_ashr_5_use) 758 759LABEL(nibble_ashr_5_restart_use): 760 movdqa (%rdi, %rdx), %xmm0 761 palignr $5, -16(%rdi, %rdx), D(%xmm0) 762#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 763 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 764#else 765 movdqa (%rsi,%rdx), %xmm1 766 TOLOWER (%xmm0, %xmm1) 767 pcmpistri $0x1a, %xmm1, %xmm0 768#endif 769 jbe LABEL(exit_use) 770#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 771 sub $16, %r11 772 jbe LABEL(strcmp_exitz) 773#endif 774 775 add $16, %rdx 776 add $16, %r10 777 jg LABEL(nibble_ashr_5_use) 778 779 movdqa (%rdi, %rdx), %xmm0 780 781 palignr $5, -16(%rdi, %rdx), D(%xmm0) 782#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 783 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 784#else 785 
movdqa (%rsi,%rdx), %xmm1 786 TOLOWER (%xmm0, %xmm1) 787 pcmpistri $0x1a, %xmm1, %xmm0 788#endif 789 jbe LABEL(exit_use) 790#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 791 sub $16, %r11 792 jbe LABEL(strcmp_exitz) 793#endif 794 add $16, %rdx 795 jmp LABEL(loop_ashr_5_use) 796 797 .p2align 4 798LABEL(nibble_ashr_5_use): 799 sub $0x1000, %r10 800 movdqa -16(%rdi, %rdx), %xmm0 801 psrldq $5, D(%xmm0) 802 pcmpistri $0x3a,%xmm0, %xmm0 803#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 804 cmp %r11, %rcx 805 jae LABEL(nibble_ashr_exit_use) 806#endif 807 cmp $10, %ecx 808 ja LABEL(nibble_ashr_5_restart_use) 809 810 jmp LABEL(nibble_ashr_exit_use) 811 812/* 813 * The following cases will be handled by ashr_6 814 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 815 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 816 */ 817 .p2align 4 818LABEL(ashr_6): 819 pslldq $10, D(%xmm2) 820 TOLOWER (%xmm1, %xmm2) 821 pcmpeqb %xmm1, D(%xmm2) 822 psubb %xmm0, D(%xmm2) 823 pmovmskb %xmm2, %r9d 824 shr %cl, %edx 825 shr %cl, %r9d 826 sub %r9d, %edx 827 jnz LABEL(less32bytes) 828 movdqa (%rdi), %xmm3 829 830 UPDATE_STRNCMP_COUNTER 831 832 mov $16, %rcx /* index for loads */ 833 mov $6, %r9d /* byte position left over from less32bytes case */ 834 /* 835 * Setup %r10 value allows us to detect crossing a page boundary. 836 * When %r10 goes positive we have crossed a page boundary and 837 * need to do a nibble. 
838 */ 839 lea 6(%rdi), %r10 840 and $0xfff, %r10 /* offset into 4K page */ 841 sub $0x1000, %r10 /* subtract 4K pagesize */ 842 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 843 844 .p2align 4 845LABEL(loop_ashr_6_use): 846 add $16, %r10 847 jg LABEL(nibble_ashr_6_use) 848 849LABEL(nibble_ashr_6_restart_use): 850 movdqa (%rdi, %rdx), %xmm0 851 palignr $6, -16(%rdi, %rdx), D(%xmm0) 852#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 853 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 854#else 855 movdqa (%rsi,%rdx), %xmm1 856 TOLOWER (%xmm0, %xmm1) 857 pcmpistri $0x1a, %xmm1, %xmm0 858#endif 859 jbe LABEL(exit_use) 860#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 861 sub $16, %r11 862 jbe LABEL(strcmp_exitz) 863#endif 864 865 add $16, %rdx 866 add $16, %r10 867 jg LABEL(nibble_ashr_6_use) 868 869 movdqa (%rdi, %rdx), %xmm0 870 palignr $6, -16(%rdi, %rdx), D(%xmm0) 871#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 872 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 873#else 874 movdqa (%rsi,%rdx), %xmm1 875 TOLOWER (%xmm0, %xmm1) 876 pcmpistri $0x1a, %xmm1, %xmm0 877#endif 878 jbe LABEL(exit_use) 879#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 880 sub $16, %r11 881 jbe LABEL(strcmp_exitz) 882#endif 883 add $16, %rdx 884 jmp LABEL(loop_ashr_6_use) 885 886 .p2align 4 887LABEL(nibble_ashr_6_use): 888 sub $0x1000, %r10 889 movdqa -16(%rdi, %rdx), %xmm0 890 psrldq $6, D(%xmm0) 891 pcmpistri $0x3a,%xmm0, %xmm0 892#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 893 cmp %r11, %rcx 894 jae LABEL(nibble_ashr_exit_use) 895#endif 896 cmp $9, %ecx 897 ja LABEL(nibble_ashr_6_restart_use) 898 899 jmp LABEL(nibble_ashr_exit_use) 900 901/* 902 * The following cases will be handled by ashr_7 903 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 904 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 905 */ 906 .p2align 4 907LABEL(ashr_7): 908 pslldq $9, D(%xmm2) 909 TOLOWER (%xmm1, %xmm2) 910 pcmpeqb %xmm1, 
D(%xmm2) 911 psubb %xmm0, D(%xmm2) 912 pmovmskb %xmm2, %r9d 913 shr %cl, %edx 914 shr %cl, %r9d 915 sub %r9d, %edx 916 jnz LABEL(less32bytes) 917 movdqa (%rdi), %xmm3 918 919 UPDATE_STRNCMP_COUNTER 920 921 mov $16, %rcx /* index for loads */ 922 mov $7, %r9d /* byte position left over from less32bytes case */ 923 /* 924 * Setup %r10 value allows us to detect crossing a page boundary. 925 * When %r10 goes positive we have crossed a page boundary and 926 * need to do a nibble. 927 */ 928 lea 7(%rdi), %r10 929 and $0xfff, %r10 /* offset into 4K page */ 930 sub $0x1000, %r10 /* subtract 4K pagesize */ 931 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ 932 933 .p2align 4 934LABEL(loop_ashr_7_use): 935 add $16, %r10 936 jg LABEL(nibble_ashr_7_use) 937 938LABEL(nibble_ashr_7_restart_use): 939 movdqa (%rdi, %rdx), %xmm0 940 palignr $7, -16(%rdi, %rdx), D(%xmm0) 941#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 942 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 943#else 944 movdqa (%rsi,%rdx), %xmm1 945 TOLOWER (%xmm0, %xmm1) 946 pcmpistri $0x1a, %xmm1, %xmm0 947#endif 948 jbe LABEL(exit_use) 949#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 950 sub $16, %r11 951 jbe LABEL(strcmp_exitz) 952#endif 953 954 add $16, %rdx 955 add $16, %r10 956 jg LABEL(nibble_ashr_7_use) 957 958 movdqa (%rdi, %rdx), %xmm0 959 palignr $7, -16(%rdi, %rdx), D(%xmm0) 960#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L 961 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 962#else 963 movdqa (%rsi,%rdx), %xmm1 964 TOLOWER (%xmm0, %xmm1) 965 pcmpistri $0x1a, %xmm1, %xmm0 966#endif 967 jbe LABEL(exit_use) 968#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 969 sub $16, %r11 970 jbe LABEL(strcmp_exitz) 971#endif 972 add $16, %rdx 973 jmp LABEL(loop_ashr_7_use) 974 975 .p2align 4 976LABEL(nibble_ashr_7_use): 977 sub $0x1000, %r10 978 movdqa -16(%rdi, %rdx), %xmm0 979 psrldq $7, D(%xmm0) 980 pcmpistri $0x3a,%xmm0, %xmm0 981#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$8, %ecx
	ja	LABEL(nibble_ashr_7_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(8~15)            n - 8           7(15 +(n - 8) - n)   ashr_8
 */
	.p2align 4
LABEL(ashr_8):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 8
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$8, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_8_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $8 to
	 * join the last 8 bytes of the previous aligned %rdi block with
	 * the first 8 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_8_use)

LABEL(nibble_ashr_8_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$8, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_8_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$8, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_8_use)

	.p2align 4
LABEL(nibble_ashr_8_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 8 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 8 bytes is NUL (%ecx > 7) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$8, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$7, %ecx
	ja	LABEL(nibble_ashr_8_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(7~15)            n - 7           8(15 +(n - 7) - n)   ashr_9
 */
	.p2align 4
LABEL(ashr_9):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 7
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$7, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_9_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $9 to
	 * join the last 7 bytes of the previous aligned %rdi block with
	 * the first 9 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)

LABEL(nibble_ashr_9_restart_use):
	movdqa	(%rdi, %rdx), %xmm0

	palignr	$9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_9_use)

	.p2align 4
LABEL(nibble_ashr_9_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 7 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 7 bytes is NUL (%ecx > 6) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$9, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$6, %ecx
	ja	LABEL(nibble_ashr_9_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(6~15)            n - 6           9(15 +(n - 6) - n)   ashr_10
 */
	.p2align 4
LABEL(ashr_10):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 6
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$6, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_10_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $10
	 * to join the last 6 bytes of the previous aligned %rdi block with
	 * the first 10 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

LABEL(nibble_ashr_10_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$10, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$10, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_10_use)

	.p2align 4
LABEL(nibble_ashr_10_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 6 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 6 bytes is NUL (%ecx > 5) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$10, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$5, %ecx
	ja	LABEL(nibble_ashr_10_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(5~15)            n - 5           10(15 +(n - 5) - n)  ashr_11
 */
	.p2align 4
LABEL(ashr_11):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 5
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$5, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_11_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $11
	 * to join the last 5 bytes of the previous aligned %rdi block with
	 * the first 11 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

LABEL(nibble_ashr_11_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$11, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$11, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_11_use)

	.p2align 4
LABEL(nibble_ashr_11_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 5 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 5 bytes is NUL (%ecx > 4) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$11, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$4, %ecx
	ja	LABEL(nibble_ashr_11_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(4~15)            n - 4           11(15 +(n - 4) - n)  ashr_12
 */
	.p2align 4
LABEL(ashr_12):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 4
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$4, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_12_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $12
	 * to join the last 4 bytes of the previous aligned %rdi block with
	 * the first 12 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

LABEL(nibble_ashr_12_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_12_use)

	.p2align 4
LABEL(nibble_ashr_12_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 4 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 4 bytes is NUL (%ecx > 3) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$12, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$3, %ecx
	ja	LABEL(nibble_ashr_12_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(3~15)            n - 3           12(15 +(n - 3) - n)  ashr_13
 */
	.p2align 4
LABEL(ashr_13):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 3
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$3, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_13_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $13
	 * to join the last 3 bytes of the previous aligned %rdi block with
	 * the first 13 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

LABEL(nibble_ashr_13_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_13_use)

	.p2align 4
LABEL(nibble_ashr_13_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 3 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if none of those 3 bytes is NUL (%ecx > 2) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$13, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$2, %ecx
	ja	LABEL(nibble_ashr_13_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(2~15)            n - 2           13(15 +(n - 2) - n)  ashr_14
 */
	.p2align 4
LABEL(ashr_14):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 2
	 * bytes, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$2, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_14_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $14
	 * to join the last 2 bytes of the previous aligned %rdi block with
	 * the first 14 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

LABEL(nibble_ashr_14_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_14_use)

	.p2align 4
LABEL(nibble_ashr_14_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the 2 bytes of the previous block that are still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if neither of those 2 bytes is NUL (%ecx > 1) and, for the
	 * strncmp variants, the count in %r11 reaches beyond them;
	 * otherwise finish in nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$14, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$1, %ecx
	ja	LABEL(nibble_ashr_14_restart_use)

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(1~15)            n - 1           14(15 +(n - 1) - n)  ashr_15
 */
	.p2align 4
LABEL(ashr_15):
	/*
	 * Check the first, partial 16-byte block: shift %xmm2 left by 1
	 * byte, compare it bytewise with %xmm1 and fold in %xmm0, then
	 * match the resulting bit mask against %edx (both shifted right
	 * by %cl).  A non-zero difference is resolved in less32bytes.
	 * NOTE(review): the entry values of %xmm0-%xmm2, %edx and %cl are
	 * set up before this part of the file - confirm there.
	 */
	pslldq	$1, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	/* strncmp variants: update the remaining count in %r11 (may exit with 0). */
	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_15_use):
	/*
	 * Main loop, two 16-byte steps per iteration.  Each step first
	 * advances the page-crossing counter %r10, then uses palignr $15
	 * to join the last byte of the previous aligned %rdi block with
	 * the first 15 bytes of the current one and compares the result
	 * with the block at (%rsi,%rdx).  jbe leaves on a mismatch or an
	 * end of string.
	 */
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

LABEL(nibble_ashr_15_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 more bytes matched; return 0 once the count is used up. */
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_15_use)

	.p2align 4
LABEL(nibble_ashr_15_use):
	/*
	 * %r10 went positive, i.e. a page boundary is being crossed.
	 * Inspect the single byte of the previous block that is still
	 * uncompared: pcmpistri $0x3a of the shifted block against itself
	 * puts the index of its first NUL byte in %ecx.  Resume the loop
	 * only if that byte is not NUL (%ecx > 0) and, for the strncmp
	 * variants, the count in %r11 reaches beyond it; otherwise fall
	 * through into nibble_ashr_exit_use.
	 */
	sub	$0x1000, %r10	/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$15, D(%xmm0)
	pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)
#endif
	cmp	$0, %ecx
	ja	LABEL(nibble_ashr_15_restart_use)

LABEL(nibble_ashr_exit_use):
	/*
	 * Reached from the nibble_ashr_N_use paths: compare the leftover
	 * bytes held in %xmm0 with the block at (%rsi,%rdx), then fall
	 * through into exit_use with the resulting flags and %rcx.
	 */
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	.p2align 4
LABEL(exit_use):
	/*
	 * Loop exit with the flags and %rcx left by pcmpistri $0x1a.  CF
	 * clear means no differing byte was found, so the result is 0.
	 * Otherwise %rcx is the offset of the first differing byte.
	 */
	jnc	LABEL(strcmp_exitz)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The difference lies at or beyond the remaining count: equal. */
	sub	%rcx, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	%rcx, %rdx
	/* Undo the palignr shift so %rdi + %rdx pairs with %rsi + %rdx. */
	lea	-16(%rdi, %r9), %rdi
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	/* Non-zero %r8d: the operands were swapped, so swap the bytes back. */
	test	%r8d, %r8d
	jz	LABEL(ret_use)
	xchg	%eax, %edx
LABEL(ret_use):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table. */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
	movl	(%rcx,%rdx,4), %edx
	movl	(%rcx,%rax,4), %eax
#endif

	sub	%edx, %eax
	ret

LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The deciding byte lies at or beyond the remaining count: equal. */
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table. */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret

	/* Return 0: the strings compare equal. */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret

	.p2align 4
	// XXX Same as code above
	/* Return the difference of the bytes at (%rdi) and (%rsi). */
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret
	cfi_endproc
	.size	STRCMP_SSE42, .-STRCMP_SSE42

#undef UCLOW_reg
#undef UCHIGH_reg
#undef LCQWORD_reg
#undef TOLOWER

	/* Put all SSE 4.2 functions together.  */
	.section .rodata.SECTION,"a",@progbits
	.p2align 3
	/*
	 * 32-bit offsets of the ashr_N entry points, relative to the table
	 * itself, in the order ashr_1 ... ashr_15, ashr_0.
	 * NOTE(review): the code that indexes this table is outside this
	 * part of the file - confirm the index computation there.
	 */
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)

/* Remove this file's helper macros. */
#undef LABEL
#undef GLABEL
#undef SECTION
#undef movdqa
#undef movdqu
#undef pmovmskb
#undef pcmpistri
#undef psubb
#undef pcmpeqb
#undef psrldq
#undef pslldq
#undef palignr
#undef pxor
#undef D