/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY	__memcpy_sse2_unaligned
#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
# endif

# ifdef USE_AS_BCOPY
#  define SRC		PARMS
#  define DEST		SRC+4
#  define LEN		DEST+4
# else
#  define DEST		PARMS
#  define SRC		DEST+4
#  define LEN		SRC+4
# endif

# define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

# define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN	RETURN_END; CFI_PUSH (%ebx)

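/* ENTRANCE pushes %ebx, so inside the body the incoming arguments sit at
	8(%esp), 12(%esp) and 16(%esp); the DEST/SRC/LEN macros above select
	the right slots for memcpy/mempcpy versus bcopy, whose first two
	arguments are swapped.  */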
	.section .text.sse2,"ax",@progbits
# if !defined USE_AS_BCOPY && defined SHARED
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
	cmp	%edx, %eax

# ifdef USE_AS_MEMMOVE
	ja	L(check_forward)

L(mm_len_0_or_more_backward):
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_backward):
	add	%ecx, %eax
	cmp	%edx, %eax
	movl	SRC(%esp), %eax
	jbe	L(forward)
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)
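/* memmove: %eax = src, %edx = dst, %ecx = len.  When src lies above dst a
	front-to-back copy is safe; otherwise fall through to the backward
	code below.  */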
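/* Two possibly overlapping 16-byte moves, one anchored at each end, cover
	any length from 16 to 32 bytes; both loads happen before either store,
	so this works even when src and dst overlap.  */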
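/* If src + len <= dst the regions do not overlap at all, so the plain
	forward memcpy code can be used.  */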

/* Save the first 64 and the last 16 source bytes, then align the end of
	the destination (dst + len) down to 16 bytes for the backward loop.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	movl	%esi, %ecx
	andl	$-16, %ecx
	leal	(%ecx), %ebx
	subl	%edx, %ebx
	leal	(%eax, %ebx), %eax
	shrl	$6, %ebx

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%eax)

	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	subl	$64, %eax
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_backward)
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)
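/* Here %esi = dst + len, %ecx = that address rounded down to 16 bytes,
	%ebx = number of 64-byte blocks between dst and the rounded address,
	%edi = len, and %eax has been advanced by the same distance as %ecx.
	The first 64 source bytes are held in %xmm4-%xmm7 and the last 16 in
	the stack slot, so the loop below may clobber them.  */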
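/* Copies of at least half the shared cache size take the non-temporal
	store loop below so they do not evict the whole cache.  */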
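/* Copy 64 bytes per iteration, walking the source pointer in %eax and the
	aligned destination end in %ecx downwards; the movaps stores are
	16-byte aligned.  */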
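/* len <= 16 here.  The bit tests on %cl pick the size class: bit 3 or 4
	set means 8..16 bytes, bit 2 means 4..7, bit 1 means 2..3, and
	otherwise the length is 0 or 1.  */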

/* Backward copy of large blocks, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	subl	$64, %eax
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_backward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(check_forward):
	add	%edx, %ecx
	cmp	%eax, %ecx
	movl	LEN(%esp), %ecx
	jbe	L(forward)
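/* Same 64-byte backward loop, but with movntdq stores that bypass the
	cache; the sfence after the loop orders them.  */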
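/* src > dst here.  If dst + len <= src the regions do not overlap at all,
	so the plain forward memcpy code can be used.  */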

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Save the last 64 and the first 16 source bytes, then round the
	destination up to the first 16-byte boundary for the forward loop.  */
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	(%eax), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	leal	16(%edx), %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	movl	%esi, %ebx
	subl	%ecx, %ebx
	shrl	$6, %ebx

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax)

	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqa	%xmm0, (%ecx)
	addl	$64, %eax
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_forward)
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN
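/* Here %ecx = the first 16-byte boundary above dst, %ebx = number of
	64-byte blocks from there up to dst + len (%esi), %edi = len, and %eax
	has been advanced to match %ecx.  The last 64 source bytes are held in
	%xmm4-%xmm7 and the first 16 in the stack slot, so the loop below may
	clobber them.  */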
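/* Copy 64 bytes per iteration from low to high addresses; the stores to
	the aligned destination in %ecx are 16-byte aligned.  */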
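/* Common exit for the overlap-handling paths: restore the extra
	callee-saved registers and return the destination.  */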

/* Forward copy of large blocks, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	addl	$64, %eax
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)
# endif

L(forward):
	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	jae     L(large_page)

	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl    $32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl    $64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl    $128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)
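/* Copy the first and last 16, 32 and then 64 bytes in pairs; any length
	up to 128 bytes is finished before the aligned loop is reached.  */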

/* Now the main loop: align the destination address to a 64-byte boundary.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax
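/* %ebx is the first 64-byte boundary above dst, %ecx the 64-byte aligned
	end of the destination, and %eax now holds src - dst, so (%ebx, %eax)
	addresses the source bytes matching the current destination chunk.  */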

/* Stop two iterations before the end so that the prefetch, which runs
	128 bytes ahead, does not touch memory past the source buffer.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jmp	L(return)

L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores: align the destination
	address to a 128-byte boundary.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
	sfence
	jmp	L(return)

L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)

L(return):
	movl	%edx, %eax
# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN

END (MEMCPY)
#endif
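/* 128 bytes per iteration with non-temporal stores; %ebx and %ecx were
	aligned to 128 bytes above and (%ebx, %eax) again addresses the
	matching source bytes.  */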