/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0	xmm0
#endif

#ifndef YMM0
# define YMM0	ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64 byte
   alignment and it is not worth it to load 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB	(VEC_SIZE > 16)
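/* When ALIGN_MOVSB, the destination (or, on non-FSRM CPUs whose src
   and dst 4k-alias, the source) is rounded up to a MOVSB_ALIGN_TO
   boundary before `rep movsb'.  The bytes skipped by the rounding are
   covered by VEC(0) (and VEC(1) when MOVSB_ALIGN_TO > VEC_SIZE), which
   are loaded up front and stored to the original destination after the
   movsb; see L(movsb_align_dst) and L(skip_short_movsb_check) below.  */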
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO	64

#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
#define LARGE_MOV_SIZE	(MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET	(4)
#else
# define SMALL_SIZE_OFFSET	(0)
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes per page for large_memcpy inner loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift rdx by to compare for memcpy_large_4x.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1; \
	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif
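/* As an illustration of the macros above (values only, not used by the
   code): with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128, which is
   2 * PREFETCH_SIZE, so PREFETCH_ONE_SET (1, (%rsi), 0) expands to two
   prefetcht0 requests 64 bytes apart; LARGE_LOAD_SIZE is also 128, so
   each LOAD_ONE_SET/STORE_ONE_SET pair moves four 32-byte vectors.  */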

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
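	/* Note that in the two-VEC copy below both loads are issued
	   before either store, so the result stays correct for memmove
	   even when source and destination overlap.  */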
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
	 */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi, %rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
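	/* Fall through: (4 * VEC, 8 * VEC] bytes.  All eight VECs (the
	   first four and the last four) are loaded before any store, so
	   arbitrary overlap between source and destination is handled
	   without further checks.  */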
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is overlap, go to the backward temporal copy no
	   matter what, since backward REP MOVSB is slow and we don't
	   want to use NT stores if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point overlap with dst > src is impossible.  So
	   check for overlap with src > dst, in which case correctness
	   requires forward copy.  Otherwise decide between backward and
	   forward copy depending on address aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has different sign than rcx then there is overlap so we
	   must do forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must do forward copy for correctness.
	   Otherwise if ecx is non-zero there is 4k false aliasing so do
	   backward copy.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or when coming from the short distance
	   movsb check.  */
L(more_8x_vec_forward):
	/* Load first and last 4 * VEC to support overlapping addresses.
	 */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
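	/* Illustration of the alignment math above (values are only an
	   example, assuming VEC_SIZE == 32): with dst = 0x1005 and
	   src = 0x2007, `orq $31, %rdi' gives 0x101f and `incq %rdi'
	   gives 0x1020, the next 32-byte boundary strictly above dst.
	   src is moved by the same amount (0x2022 here), so the loop
	   below uses aligned stores while the head bytes skipped over
	   (0x1005..0x101f) are covered by the VEC(0) store after the
	   loop.  rdx now points VEC_SIZE * 4 below the end of dst, which
	   is both the loop bound and the base for the tail stores.  */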
	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(1)
	VMOVU	VEC_SIZE(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VEC(1), (%rdi)
	VMOVA	%VEC(2), VEC_SIZE(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VEC(7), VEC_SIZE(%rdx)
	VMOVU	%VEC(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VEC(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
	 */
L(nop_backward):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with new values for aligned dst.
	 */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
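	/* Summary of the arithmetic above: after the increment below rsi
	   points at the next MOVSB_ALIGN_TO boundary strictly above src,
	   rdi is advanced by the same amount, and rcx holds the length
	   that remains for `rep movsb'.  The skipped head bytes are
	   covered by the VEC(0)/VEC(1) stores after the movsb.  */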
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is overlap, go to the backward temporal copy no
	   matter what, since backward REP MOVSB is slow and we don't
	   want to use NT stores if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold it is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  Early check for backward copy
	   conditions means the only case of slow movsb with
	   src = dst + [0, 63] is ecx in [-63, 0].  Use unsigned
	   comparison with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* Fall through means the CPU has FSRM.  In that case exclusively
	   align destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with new values for aligned dst.
	 */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	jb	L(more_8x_vec_check)
L(large_memcpy_2x):
	/* To reach this point overlap with dst > src is impossible.
	   What remains to check is overlap with src > dst.  rcx already
	   contains dst - src.  Negate rcx to get src - dst.  If
	   length > rcx then there is overlap and forward copy is best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)
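	/* Non-temporal copy.  The buffer is processed in interleaved
	   page-sized streams: each inner-loop iteration below loads
	   LARGE_LOAD_SIZE bytes from the same offset in 2 (or, for
	   L(large_memcpy_4x), 4) consecutive pages and writes them back
	   with non-temporal stores, with an sfence after the loop.  */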
	/* Cache align destination.  First store the first 64 bytes then
	   adjust alignments.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
#  endif
# endif
	VMOVU	%VEC(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  not ecx will return dst - src - 1
	   which works for testing aliasing.  */
	notl	%ecx
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	movq	%rdx, %r10
	shrq	$LOG_4X_MEMCPY_THRESH, %r10
	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
	jae	L(large_memcpy_4x)

	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC a time forward.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

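	/* At most 4 * VEC bytes remain; they are finished below with
	   four unaligned loads/stores taken from the very end of the
	   buffer, which may partially overlap bytes already copied.  */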
L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
L(large_memcpy_4x):
	movq	%rdx, %r10
	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC a time forward.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))