/* wcscpy with SSSE3
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* __wcscpy_ssse3: copy a zero-terminated string of 4-byte elements
   (the code tests elements with cmpl/pcmpeqd) from the source to the
   destination, terminator included.

   Syntax: AT&T.  ABI: System V AMD64.
   In:   %rdi = destination, %rsi = source.
   Out:  %rax = the value %rdi had on entry.
   Uses: %rax, %rcx, %rdx, %rsi, %r9, %xmm0-%xmm7 and the flags.  The
	 body never touches %rsp.

   Register roles inside the body:
     %rcx   source cursor
     %rdx   destination cursor
     %rsi   byte offset added to both cursors on entry to
	    L(CopyFrom1To16Bytes)
     %rax   pmovmskb mask of a 16-byte block; pcmpeqd sets all four
	    bytes of a zero element, so each element owns four mask bits
     %xmm0  zero comparand.  pcmpeqd writes its result into %xmm0, and
	    that result is all zeros exactly when no terminator was
	    found, so %xmm0 is zero again on every fall-through path.

   Structure:
     1. Test the first four elements one by one (L(Exit4..16)).
     2. Copy the first 16 bytes, test the first aligned source block.
     3. Align the destination to 16 bytes and dispatch on the source
	offset modulo 16: L(Align16Both), L(Shl4), L(Shl8), L(Shl12).
     4. Each path copies a few 16-byte blocks one at a time, then
	switches to a loop that handles 64 source bytes per iteration.
     5. L(CopyFrom1To16Bytes) copies the final block up to and
	including the terminator and returns.

   Alignment assumption: the dispatch in step 3 treats every offset
   other than 0, 4 and 8 as 12, and the L(Shl*) paths use movaps at
   fixed displacements from %rcx, so source and destination must both
   be 4-byte aligned.  */

	.section .text.ssse3,"ax",@progbits
ENTRY (__wcscpy_ssse3)

	mov	%rsi, %rcx		/* rcx = source cursor.  */
	mov	%rdi, %rdx		/* rdx = destination cursor; rdi is
					   kept for the return value.  */

	/* Short strings: if the terminator is one of the first four
	   elements, copy 4/8/12/16 bytes and return.  */
	cmpl	$0, (%rcx)
	jz	L(Exit4)
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)
	cmpl	$0, 8(%rcx)
	jz	L(Exit12)
	cmpl	$0, 12(%rcx)
	jz	L(Exit16)

	/* The first 16 source bytes hold no terminator.
	   rsi = first 16-byte aligned address strictly above rcx.  */
	lea	16(%rcx), %rsi
	and	$-16, %rsi

	pxor	%xmm0, %xmm0		/* xmm0 = 0, the search comparand.  */
	mov	(%rcx), %r9		/* Copy source bytes 0..7.  */
	mov	%r9, (%rdx)

	pcmpeqd	(%rsi), %xmm0		/* Test the aligned block at rsi.  */
	mov	8(%rcx), %r9		/* Copy source bytes 8..15.  */
	mov	%r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	sub	%rcx, %rsi		/* rsi = offset of that block from
					   the source start.  */

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)	/* Terminator is in that block.  */

	/* Advance rdx to the next 16-byte boundary and move rcx forward
	   by the same number of bytes, so both cursors keep referring
	   to the same position in the string.  */
	mov	%rdx, %rax
	lea	16(%rdx), %rdx
	and	$-16, %rdx
	sub	%rdx, %rax		/* rax = old rdx - aligned rdx.  */
	sub	%rax, %rcx		/* rcx += aligned rdx - old rdx.  */
	mov	%rcx, %rax
	and	$0xf, %rax		/* rax = rcx mod 16; sets ZF.  */
	mov	$0, %rsi		/* Must stay a mov: unlike xor it
					   leaves the flags of the and above
					   intact for the jz below.  */

/* case: rcx_offset == rdx_offset */

	jz	L(Align16Both)

	/* Source is off by 4, 8 or 12 bytes relative to the aligned
	   destination; pick the matching palignr path.  */
	cmp	$4, %rax
	je	L(Shl4)
	cmp	$8, %rax
	je	L(Shl8)
	jmp	L(Shl12)

/* Both cursors are 16-byte aligned.  rsi (0 on entry) is the running
   offset.  Each step loads the next source block, stores the previous
   one (already known to be terminator-free), tests the new block and
   advances rsi by 16, so that on a hit rcx + rsi addresses the block
   that holds the terminator.  */
L(Align16Both):
	movaps	(%rcx), %xmm1
	movaps	16(%rcx), %xmm2
	movaps	%xmm1, (%rdx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm4
	movaps	%xmm3, (%rdx, %rsi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm1
	movaps	%xmm4, (%rdx, %rsi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm2
	movaps	%xmm1, (%rdx, %rsi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	/* Store the last tested block, then round the source cursor down
	   to a 64-byte boundary and shift the destination cursor by the
	   same amount before entering the 64-byte loop.  */
	movaps	%xmm3, (%rdx, %rsi)
	mov	%rcx, %rax
	lea	16(%rcx, %rsi), %rcx
	and	$-0x40, %rcx
	sub	%rcx, %rax		/* rax = old rcx - new rcx.  */
	sub	%rax, %rdx		/* rdx += new rcx - old rcx.  */

	mov	$-0x40, %rsi		/* The loop advances the cursors
					   before it leaves, so the exit
					   offset starts at -64.  */

	/* Main aligned loop: 64 bytes per iteration.  The four blocks
	   are folded with pminub and tested against zero in one go;
	   the unmodified blocks stay in xmm4..xmm7 for the stores.  */
	.p2align 4
L(Aligned64Loop):
	movaps	(%rcx), %xmm2
	movaps	%xmm2, %xmm4
	movaps	16(%rcx), %xmm5
	movaps	32(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	48(%rcx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	pcmpeqd	%xmm0, %xmm3
	pmovmskb %xmm3, %rax
	lea	64(%rdx), %rdx
	lea	64(%rcx), %rcx
	test	%rax, %rax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%rdx)
	movaps	%xmm5, -48(%rdx)
	movaps	%xmm6, -32(%rdx)
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* The folded test fired.  Re-test the four blocks one at a time, storing
   each block once it is known to be clean and stepping rsi from -64 in
   units of 16 so that rcx + rsi addresses the block under test.
   pminub works on bytes while pcmpeqd tests dwords, so the folded test
   can report a zero element that no single block contains; in that
   case all four blocks are stored and the loop is resumed.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %rax
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0

	pmovmskb %xmm0, %rax
	movaps	%xmm4, -64(%rdx)
	test	%rax, %rax
	lea	16(%rsi), %rsi		/* lea leaves the flags alone.  */
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0

	pmovmskb %xmm0, %rax
	movaps	%xmm5, -48(%rdx)
	test	%rax, %rax
	lea	16(%rsi), %rsi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%rdx)
	pcmpeqd	%xmm7, %xmm0

	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %rsi		/* False alarm: reset the offset.  */
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* Source is 4 bytes past a 16-byte boundary, destination is aligned.
   All loads are aligned (at rcx - 4, rcx + 12, ...); palignr $4 joins
   two neighbouring source blocks into one aligned destination block.
   xmm1 and xmm3 take turns holding the previous source block.  */
	.p2align 4
L(Shl4):
	movaps	-4(%rcx), %xmm1
	movaps	12(%rcx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3		/* Keep the raw block; palignr
					   overwrites xmm2.  */

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	28(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Round the aligned source address down to a 64-byte boundary,
	   shift rdx by the same amount, then put the 4-byte skew back
	   into rcx (12(%rcx) is the 64-byte aligned address).  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-12(%rcx), %rcx
	sub	%rax, %rdx

	movaps	-4(%rcx), %xmm1		/* Previous block for the loop.  */

	/* 64 bytes per iteration.  xmm1 and xmm2 are still unmodified
	   at the jnz, so on a (possible) terminator the code re-enters
	   L(Shl4Start), which tests the blocks one at a time.  */
	.p2align 4
L(Shl4LoopStart):
	movaps	12(%rcx), %xmm2
	movaps	28(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7		/* Raw last block: next xmm1.  */
	palignr	$4, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$4, %xmm3, %xmm4	/* palignr leaves the flags alone.  */
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl4LoopStart)

	/* The block at 12(%rcx) holds the terminator.  Copy the 16
	   bytes in front of it with unaligned moves, then let the
	   common tail finish from offset 12.  */
L(Shl4LoopExit):
	movdqu	-4(%rcx), %xmm1
	mov	$12, %rsi
	movdqu	%xmm1, -4(%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Source is 8 bytes past a 16-byte boundary, destination is aligned.
   Same scheme as L(Shl4) with palignr $8 and aligned loads at
   rcx - 8, rcx + 8, ...  */
	.p2align 4
L(Shl8):
	movaps	-8(%rcx), %xmm1
	movaps	8(%rcx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3		/* Keep the raw block; palignr
					   overwrites xmm2.  */

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	24(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Round the aligned source address down to a 64-byte boundary,
	   shift rdx by the same amount, then put the 8-byte skew back
	   into rcx (8(%rcx) is the 64-byte aligned address).  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-8(%rcx), %rcx
	sub	%rax, %rdx

	movaps	-8(%rcx), %xmm1		/* Previous block for the loop.  */

	/* 64 bytes per iteration; on a (possible) terminator re-enter
	   L(Shl8Start) with xmm1 and xmm2 still unmodified.  */
	.p2align 4
L(Shl8LoopStart):
	movaps	8(%rcx), %xmm2
	movaps	24(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7		/* Raw last block: next xmm1.  */
	palignr	$8, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$8, %xmm3, %xmm4	/* palignr leaves the flags alone.  */
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl8LoopStart)

	/* The block at 8(%rcx) holds the terminator.  Copy the 8 bytes
	   in front of it, then let the common tail finish from
	   offset 8.  */
L(Shl8LoopExit):
	mov	(%rcx), %r9
	mov	$8, %rsi
	mov	%r9, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Source is 12 bytes past a 16-byte boundary, destination is aligned.
   Same scheme as L(Shl4) with palignr $12 and aligned loads at
   rcx - 12, rcx + 4, ...  This is also the fall-back of the dispatch
   for every offset other than 0, 4 and 8.  */
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1
	movaps	4(%rcx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3		/* Keep the raw block; palignr
					   overwrites xmm2.  */

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	20(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Round the aligned source address down to a 64-byte boundary,
	   shift rdx by the same amount, then put the 12-byte skew back
	   into rcx (4(%rcx) is the 64-byte aligned address).  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-4(%rcx), %rcx
	sub	%rax, %rdx

	movaps	-12(%rcx), %xmm1	/* Previous block for the loop.  */

	/* 64 bytes per iteration; on a (possible) terminator re-enter
	   L(Shl12Start) with xmm1 and xmm2 still unmodified.  */
	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7		/* Raw last block: next xmm1.  */
	palignr	$12, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$12, %xmm3, %xmm4	/* palignr leaves the flags alone.  */
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl12LoopStart)

	/* The block at 4(%rcx) holds the terminator.  Copy the 4 bytes
	   in front of it, then let the common tail finish from
	   offset 4.  */
L(Shl12LoopExit):
	mov	(%rcx), %r9d
	mov	$4, %rsi
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Common tail.
   In:  rax = pmovmskb mask of the block that holds the terminator
	      (bits 0-3 element 0, bits 4-7 element 1, bits 8-11
	      element 2, bits 12-15 element 3),
	rsi = byte offset of that block from rcx and rdx.
   Copies the block up to and including the first zero element and
   returns the original destination in rax.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%rsi, %rdx
	add	%rsi, %rcx

	test	%al, %al
	jz	L(ExitHigh)		/* Elements 0 and 1 are non-zero.  */
	test	$0x01, %al
	jnz	L(Exit4)		/* Element 0 is the terminator.  */

	/* Element 1 is the terminator: copy 8 bytes.  */
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)		/* Element 2 is the terminator.  */

	/* Element 3 is the terminator: copy 16 bytes.  */
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

	/* Copy 4 bytes from (rcx) to (rdx); return the destination.  */
	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	mov	%rdi, %rax
	ret

	/* Copy 8 bytes from (rcx) to (rdx); return the destination.  */
	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	/* Copy 12 bytes (8 + 4) from (rcx) to (rdx); return the
	   destination.  */
	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
	mov	%rdi, %rax
	ret

	/* Copy 16 bytes (8 + 8) from (rcx) to (rdx); return the
	   destination.  */
	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

END(__wcscpy_ssse3)
#endif