/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_evex
# endif

# endif

# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# define XMM2 xmm18
# define XMM3 xmm19

# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22
# define YMM7 ymm23

# ifndef USE_AS_STRCAT

/* zero register */
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMM1 ymm17

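/* Overview: the entry code handles the first one or two vectors
   according to the alignment of the source, then falls into a loop
   that processes 4 * VEC_SIZE bytes per iteration.  Null bytes are
   detected with VPCMPB against YMMZERO (the result lands in a mask
   register) and located with BSF.  For the strncpy/stpncpy variants
   the remaining count is kept in %r8 and checked alongside the
   null-byte tests.  */
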
	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
# endif
	mov	%rsi, %rcx
# ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
# endif

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# endif

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpb	$0, (%rsi), %YMMZERO, %k0
	kmovd	%k0, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
# else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
# endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd	%k1, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
	VMOVU	%YMM2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	VMOVA	(%rsi, %rcx), %YMM2
	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb	$0, %YMM4, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM4, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU	%YMM2, (%rdi, %rcx)
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
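/* Main loop: read 4 aligned vectors, merge them with VPMINUB so that a
   single VPCMPB against YMMZERO can tell whether any of the
   4 * VEC_SIZE bytes is zero, and only then work out which vector
   holds the terminator in L(UnalignedFourVecSizeLeave).  */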
L(UnalignedFourVecSizeLoop):
	VMOVA	(%rsi), %YMM4
	VMOVA	VEC_SIZE(%rsi), %YMM5
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM5, %YMM4, %YMM2
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA	(%rsi), %YMM4
	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA	VEC_SIZE(%rsi), %YMM5
	vpminub	%YMM5, %YMM4, %YMM2
	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU	%YMM7, -VEC_SIZE(%rdi)
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %ecx
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	VMOVU	(%rsi), %YMM3
	VMOVU	VEC_SIZE(%rsi), %YMM2
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
# else
	cmp	$(VEC_SIZE + 1), %r8
# endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	VMOVU	%YMM3, (%rdi)
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
# else
	cmp	$((VEC_SIZE * 2) + 1), %r8
# endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

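/* For strncpy the tail handling distinguishes three cases:
   Case1: a null byte was found and the count in %r8 is not yet
	  exhausted, so the string is copied through the Exit* labels
	  (followed by zero-filling the rest of the buffer).
   Case2: the count runs out in the current vector and a null byte was
	  also seen there; the null-byte offset is compared with %r8 to
	  pick between a normal exit and L(StrncpyExit).
   Case3: the count runs out with no null byte seen; exactly %r8 bytes
	  are copied via L(StrncpyExit).  */
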
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU	%YMM6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU	%YMM5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU	%YMM4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU	%YMM3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
# endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------ End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes ------*/

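/* L(ExitN) / L(ExitN_M): %rdx holds the offset of the null byte, so
   %rdx + 1 bytes (terminator included) are copied, possibly with two
   overlapping loads/stores.  For strncpy the leftover count is kept in
   %r8 and the rest of the destination is zero-filled afterwards.  */
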
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU	(%rsi), %XMM2
	VMOVU	-15(%rsi, %rdx), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU	(%rsi), %YMM2
	VMOVU	-31(%rsi, %rdx), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

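/* L(StrncpyExitN) / L(StrncpyExitN_M): the remaining count in %r8 is
   known to be in the given range and no null byte stops the copy
   earlier, so exactly %r8 bytes are copied (plus a trailing null byte
   for strcat).  */
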
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
# endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
# endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU	(%rsi), %XMM2
	VMOVU	-16(%rsi, %r8), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	VMOVU	(%rsi), %YMM2
	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU	(%rsi), %YMM2
	VMOVU	32(%rsi), %YMM3
	mov	64(%rsi), %cl
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, 32(%rdi)
	mov	%cl, 64(%rdi)
# ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
# endif
	ret

# ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU	%XMMZERO, (%rdi)
	VMOVU	%XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU	%YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

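/* strncpy must pad the destination with null bytes up to the full
   count.  L(StrncpyFillTailWithZero) stores one unaligned vector of
   zeroes, aligns %rdi, clears 4 * VEC_SIZE bytes per iteration, and
   lets the L(Fill*) labels finish the last few bytes.  */
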
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	VMOVU	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
# endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	VMOVU	%YMM4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
# endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
# ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif
	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %edx
	VMOVU	%YMM4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
# ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
# else
	jnz	L(CopyVecSize)
# endif

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
# ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
# else
	jnz	L(CopyVecSize)
# endif

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %edx
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
# ifdef USE_AS_STPCPY
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
# endif
	ret

	.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
	mov	%rdi, %rax
# endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif