/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Minimum of packed unsigned dwords.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Minimum of packed unsigned bytes.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */

/* The main idea of the string comparison (byte or dword) with AVX2
   is to compare (VPCMPEQ) two ymm vectors, of either packed bytes or
   packed dwords depending on USE_AS_WCSCMP.  In order to also check
   for the null char, the algorithm keeps the matched bytes/dwords,
   requiring two more AVX2 instructions (VPMINU and VPCMPEQ).  In
   general, the cost of comparing VEC_SIZE bytes (32 bytes) is two
   VPCMPEQ and one VPMINU instruction, together with vmovdqu and testl
   instructions.  The main loop (away from the page boundary) compares
   4 vectors at a time, effectively comparing 4 x VEC_SIZE bytes
   (128 bytes) per iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero
   is returned.  */

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
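	/* For each vector: VPCMPEQ sets a byte/dword to all-ones where
	   the two sources match and to zero where they differ.  Taking
	   VPMINU of that mask and the first source then yields zero
	   exactly at a mismatch or at a null char in the first source;
	   the VPCMPEQ against the all-zero %ymm7 flags those positions
	   and vpmovmskb turns them into a bit mask for tzcnt.  */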
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
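	/* %rdx currently holds the byte index within the fourth vector;
	   adding VEC_SIZE * 3 below converts it to the offset from the
	   start of the strings before it is checked against %r11.  */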
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.  */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, unlike SSE2 strcmp where
	   ORing is possible.  */
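	/* %ymm4, %ymm1, %ymm5 and %ymm6 hold the per-vector results
	   (a zero byte means a mismatch or a null char); %ymm0 is
	   their combined minimum already compared against zero, so a
	   nonzero bit mask below means one of the four vectors needs
	   a closer look.  */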
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
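	/* In the main loop %r11 is relative to the current block base
	   (%rax): it was reduced by the alignment adjustment at
	   L(main_loop_header) and by VEC_SIZE * 4 on every iteration
	   of L(loop), so it can be compared directly against in-block
	   offsets.  */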
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
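	/* The straddling block is compared again in full via
	   L(back_to_loop) below without going through the counter test
	   at L(loop_start), so only PAGE_SIZE / (VEC_SIZE * 4) - 1
	   further blocks remain before the next crossing has to be
	   handled.  */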
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  This label
	   checks whether the strncmp maximum offset has been
	   reached.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting
	   with the wider ymm vector if possible, otherwise with xmm.
	   If fetching 16 bytes (xmm) would still cross the boundary,
	   the comparison must be done byte by byte.  */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
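	/* %eax still holds (%edi | %esi) & (PAGE_SIZE - 1) from the
	   entry path.  Since ORing can only set bits, this is an upper
	   bound on the page offset of both strings, so PAGE_SIZE - %eax
	   bytes can be read from either string without crossing a page
	   boundary.  */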
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not
	   needed for wcscmp/wcsncmp since a wide char is 4 bytes.  */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try a 4-byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif