/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
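
/* For readers, a rough C sketch of the dispatch described above.  This
   is illustrative only: the helper functions are hypothetical, the
   lower-case names stand in for VEC_SIZE and the __x86_* tunables, and
   the code below does not test the cases in exactly this order.

     static void *
     copy_dispatch (char *dst, const char *src, size_t len)
     {
       if (len <= 2 * vec_size)
         return copy_overlapping_ends (dst, src, len);   // Points 1-2.
       if (len <= 8 * vec_size)
         return copy_all_in_registers (dst, src, len);   // Point 3.
       if (have_erms && len >= rep_movsb_threshold
           && len < rep_movsb_stop_threshold)
         return rep_movsb_copy (dst, src, len);          // Point 6.
       if (len >= shared_non_temporal_threshold
           && !ranges_overlap (dst, src, len))           // Points 7-9.
         return (len < 16 * shared_non_temporal_threshold
                 && !pages_alias (dst, src))
           ? nt_copy_2_pages (dst, src, len)
           : nt_copy_4_pages (dst, src, len);
       if (dst > src)
         return copy_backward_4x_vec (dst, src, len);    // Point 4.
       return copy_forward_4x_vec (dst, src, len);       // Point 5.
     }  */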

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0				xmm0
#endif

#ifndef YMM0
# define YMM0				ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB	(VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO	64

#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
#define LARGE_MOV_SIZE	(MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET	(4)
#else
# define SMALL_SIZE_OFFSET	(0)
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded per page in each iteration of the large_memcpy inner
   loops.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift rdx by to compare for memcpy_large_4x.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1; \
	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
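
/* LOAD_ONE_SET/STORE_ONE_SET move LARGE_LOAD_SIZE bytes per invocation:
   unaligned vector loads paired with non-temporal stores.  Because the
   non-temporal stores are weakly ordered, the copy loops that use them
   end with an sfence before any further ordinary stores.  */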

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
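	/* Copy backward: point rsi/rdi at the last byte, set the
	   direction flag for a descending REP MOVSB, and clear it again
	   afterwards (the ABI requires DF to be clear on return).  */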
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE, this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
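	/* Copy of 0 to 3 bytes.  With LARGE_MOV_SIZE, rdx was reduced by
	   4 above; SMALL_SIZE_OFFSET compensates for that in the
	   comparison and the addressing below so the 2-byte tail
	   load/store still covers the last bytes.  */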
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi, %rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point, dst > src with overlap is impossible.
	   Still to check is src > dst with overlap, in which case
	   correctness requires a forward copy.  Otherwise decide between
	   backward/forward copy depending on address aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so
	   we must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must go forward for correctness.
	   Otherwise, if ecx is non-zero there is 4k false aliasing so do
	   the backward copy.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short distance movsb check.  */
L(more_8x_vec_forward):
	/* Load first and last 4 * VEC to support overlapping addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Inclusively align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(1)
	VMOVU	VEC_SIZE(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
	subq	$-(VEC_SIZE * 4), %rsi
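	/* Stepping by subtracting the negative constant keeps the
	   128-byte step (VEC_SIZE == 32) within a sign-extended 8-bit
	   immediate, which an equivalent addq could not use.  */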
	VMOVA	%VEC(1), (%rdi)
	VMOVA	%VEC(2), VEC_SIZE(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VEC(7), VEC_SIZE(%rdx)
	VMOVU	%VEC(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VEC(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.  */
L(nop_backward):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with new values for aligned dst.  */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold, this is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  The early check for the backward-copy
	   conditions means the only remaining slow-movsb case, src = dst
	   + [0, 63], has ecx in [-63, 0].  Use an unsigned comparison
	   with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* Falling through means the CPU has FSRM.  In that case
	   exclusively align the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with new values for aligned dst.  */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	jb	L(more_8x_vec_check)
L(large_memcpy_2x):
	/* To reach this point it is impossible to have both dst > src
	   and overlap.  Remaining to check is src > dst with overlap.
	   rcx already contains dst - src.  Negate rcx to get src - dst.
	   If length > rcx then there is overlap and forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache align destination.  First store the first 64 bytes then
	   adjust alignments.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
#  endif
# endif
	VMOVU	%VEC(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
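	/* r8 is (dst & 63) - 64, i.e. minus the number of bytes needed to
	   reach the next 64-byte boundary; those bytes were already
	   covered by the unaligned stores above.  */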

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	movq	%rdx, %r10
	shrq	$LOG_4X_MEMCPY_THRESH, %r10
	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
	jae	L(large_memcpy_4x)

	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
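	/* r10 already holds len >> LOG_4X_MEMCPY_THRESH from the
	   threshold check, so the extra shift leaves
	   len / (2 * PAGE_SIZE), the number of 2-page blocks to copy.  */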
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence
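	/* The non-temporal stores above are weakly ordered; the sfence
	   orders them before the ordinary stores that copy the tail.  */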

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
L(large_memcpy_4x):
	movq	%rdx, %r10
	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))