/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY	__memcpy_sse2_unaligned
#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
# endif

# ifdef USE_AS_BCOPY
#  define SRC		PARMS
#  define DEST		SRC+4
#  define LEN		DEST+4
# else
#  define DEST		PARMS
#  define SRC		DEST+4
#  define LEN		SRC+4
# endif

# define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
# if !defined USE_AS_BCOPY && defined SHARED
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
	cmp	%edx, %eax

# ifdef USE_AS_MEMMOVE
	ja	L(check_forward)

L(mm_len_0_or_more_backward):
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
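/* All eight unaligned loads (the first 64 bytes of the source and the
   last 64 bytes, which may overlap for lengths below 128) are issued
   before the first store, so the copy is also correct when source and
   destination overlap.  */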
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_backward):
	add	%ecx, %eax
	cmp	%edx, %eax
	movl	SRC(%esp), %eax
	jbe	L(forward)
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Aligning the address of the destination.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	movl	%esi, %ecx
	andl	$-16, %ecx
	leal	(%ecx), %ebx
	subl	%edx, %ebx
	leal	(%eax, %ebx), %eax
	shrl	$6, %ebx

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%eax)

	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	subl	$64, %eax
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_backward)
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

/* Big length copy backward part.  */
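/* The copy is at least half the shared cache in size, so use
   non-temporal (movntdq) stores that bypass the cache; the sfence
   after the loop makes these weakly-ordered stores globally visible
   before the remaining unaligned movdqu stores.  */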
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	subl	$64, %eax
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_backward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(check_forward):
	add	%edx, %ecx
	cmp	%eax, %ecx
	movl	LEN(%esp), %ecx
	jbe	L(forward)

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
   separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Aligning the address of the destination.  */
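/* Save the last 64 bytes of the source in %xmm4-%xmm7 and its first 16
   bytes on the stack, then advance the destination to the next 16-byte
   boundary.  %edi holds the length, %ecx the aligned destination
   pointer, %eax the matching source position and %ebx the number of
   64-byte blocks to copy; the saved head and tail are stored after the
   loop to cover the unaligned edges.  */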
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	(%eax), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	leal	16(%edx), %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	movl	%esi, %ebx
	subl	%ecx, %ebx
	shrl	$6, %ebx

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax)

	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqa	%xmm0, (%ecx)
	addl	$64, %eax
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_forward)
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part.  */
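/* Forward counterpart of the cached loop above: the same 64 bytes per
   iteration, but with non-temporal stores because the length is at
   least half the shared cache size; the sfence then orders the
   streaming stores ahead of the ordinary trailing stores.  */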
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	addl	$64, %eax
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)
# endif

L(forward):
	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	jae	L(large_page)

	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax

/* We should stop two iterations before the termination
   (so that the prefetcht0 does not run past the end of the source).  */
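/* At this point %ebx is the first 64-byte boundary past the start of
   the destination, %ecx the last 64-byte boundary within it, and %eax
   holds src - dst, so (%ebx, %eax) addresses the source bytes that
   belong at %ebx.  The unaligned head and tail were already copied
   above, and the last two 64-byte blocks are left to the tail code
   below, which runs without prefetching.  */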
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jmp	L(return)

L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores.  We align
   the address of the destination.  */
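/* Same indexing scheme as the cached loop, but with 128-byte blocks:
   %ebx walks 128-byte-aligned destination addresses, (%ebx, %eax) the
   corresponding source, and movntdq keeps this very large copy from
   evicting the rest of the working set.  The first and last 128 bytes
   were already copied above with ordinary unaligned stores.  */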
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
	sfence
	jmp	L(return)

L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)

L(return):
	movl	%edx, %eax
# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN

END (MEMCPY)
#endif