/* strcpy with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* zero register */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* mask register */
#define ymmM	ymm1

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
#  endif

# endif

	vpxor	%xmmZ, %xmmZ, %xmmZ

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

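	/* No null byte in the first three vectors, so it must be in
	   the fourth vector (%ymm7).  */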
	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End of labels for copying 1-VEC_SIZE bytes and 1-(VEC_SIZE * 2) bytes----*/

	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

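/* In the exits below %rdx is the offset of the null terminator; each
   exit copies bytes 0..%rdx with two (possibly overlapping) loads and
   stores.  */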
	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif