1/* memcpy with SSSE3 and REP string.
2   Copyright (C) 2010-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21#if IS_IN (libc) \
22    && (defined SHARED \
23	|| defined USE_AS_MEMMOVE \
24	|| !defined USE_MULTIARCH)
25
26#include "asm-syntax.h"
27
/* Default symbol names for this variant.  They are only set when MEMCPY
   has not been defined already.  */
28#ifndef MEMCPY
29# define MEMCPY		__memcpy_ssse3_rep
30# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
31#endif
32
/* Offsets from %esp of the three stack arguments, expressed relative to
   PARMS.  With USE_AS_BCOPY the source is the first argument and the
   destination the second; otherwise the destination comes first.  The
   length is the third argument in both layouts.  */
33#ifdef USE_AS_BCOPY
34# define SRC		PARMS
35# define DEST		SRC+4
36# define LEN		DEST+4
37#else
38# define DEST		PARMS
39# define SRC		DEST+4
40# define LEN		SRC+4
41#endif
42
/* Unwind annotation for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at relative offset 0.  */
43#define CFI_PUSH(REG)						\
44  cfi_adjust_cfa_offset (4);					\
45  cfi_rel_offset (REG, 0)
46
/* Unwind annotation for the matching pop: the CFA offset shrinks by 4
   and REG is marked as restored.  */
47#define CFI_POP(REG)						\
48  cfi_adjust_cfa_offset (-4);					\
49  cfi_restore (REG)
50
/* Push or pop a 32-bit register together with its unwind annotation.  */
51#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
52#define POP(REG)	popl REG; CFI_POP (REG)
53
54#ifdef PIC
/* PIC build: EBX is loaded with a PC-relative base by the macros below,
   so ENTRANCE pushes it and RETURN_END pops it before the ret.  That
   push is why the first argument is at 8(%esp) rather than 4(%esp).  */
55# define PARMS		8		/* Preserve EBX.  */
56# define ENTRANCE	PUSH (%ebx);
57# define RETURN_END	POP (%ebx); ret
/* RETURN is RETURN_END followed by CFI_PUSH (%ebx), which re-creates
   the "EBX pushed" unwind annotation for the code that comes after the
   ret.  */
58# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* A jump table entry is the offset of label I from the table base B.  */
59# define JMPTBL(I, B)	I - B
60
61/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
62   jump table with relative offsets.  INDEX is a register that contains
63   the index into the jump table.  SCALE is the scale of INDEX.  */
64# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
65    /* We first load PC into EBX.  */				\
66    SETUP_PIC_REG(bx);						\
67    /* Get the address of the jump table.  */			\
68    addl	$(TABLE - .), %ebx;				\
69    /* Get the entry and convert the relative offset to the	\
70       absolute address.  */					\
71    addl	(%ebx,INDEX,SCALE), %ebx;			\
72    /* We loaded the jump table.  Go.  */			\
73    _CET_NOTRACK jmp *%ebx
74
/* First half of a split branch, for code that was itself reached
   through BRANCH_TO_JMPTBL_ENTRY.  That macro jumps with `jmp *%ebx',
   so EBX still holds the address of the jump target; when this macro is
   the first instruction at that target, adding (TABLE - .) leaves the
   address of TABLE in EBX.  */
75# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
76    addl	$(TABLE - .), %ebx
77
/* Second half of the split branch: EBX already holds the table address;
   add the selected relative entry and jump.  The TABLE argument is not
   used in this expansion.  */
78# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
79    addl	(%ebx,INDEX,SCALE), %ebx;			\
80    /* We loaded the jump table.  Go.  */			\
81    _CET_NOTRACK jmp *%ebx
82#else
/* Non-PIC build: nothing is pushed on entry, so the first argument is
   at 4(%esp), and a jump table entry is the label itself.  */
83# define PARMS		4
84# define ENTRANCE
85# define RETURN_END	ret
86# define RETURN		RETURN_END
87# define JMPTBL(I, B)	I
88
89/* Branch to an entry in a jump table.  TABLE is a jump table with
90   absolute offsets.  INDEX is a register that contains the index into
91   the jump table.  SCALE is the scale of INDEX.  */
92# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
93    _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
94
/* Nothing to precompute without PIC: this expands to nothing.  */
95# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
96
/* Same expansion as BRANCH_TO_JMPTBL_ENTRY in the non-PIC build.  */
97# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
98    _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
99#endif
100
101	.section .text.ssse3,"ax",@progbits
102#if !defined USE_AS_BCOPY && defined SHARED
/* Checked entry point, built only for SHARED non-bcopy variants.
   Jumps to __chk_fail when the dword at 16(%esp) is below the dword at
   12(%esp) (unsigned compare).  There is no ret: when the check passes,
   execution continues with the code placed directly after END, which
   is ENTRY (MEMCPY).
   NOTE(review): 12(%esp) is presumably the length and 16(%esp) the
   destination buffer size -- confirm against the callers.  */
103ENTRY (MEMCPY_CHK)
104	movl	12(%esp), %eax
105	cmpl	%eax, 16(%esp)
106	jb	HIDDEN_JUMPTARGET (__chk_fail)
107END (MEMCPY_CHK)
108#endif
/* Main entry.  ENTRANCE saves EBX in PIC builds; the arguments are then
   loaded as ECX = LEN, EAX = SRC, EDX = DEST.  Lengths below 48 are
   dispatched through a jump table indexed by ECX, longer ones go to
   L(48bytesormore).  */
109ENTRY (MEMCPY)
110	ENTRANCE
111	movl	LEN(%esp), %ecx
112	movl	SRC(%esp), %eax
113	movl	DEST(%esp), %edx
114
115#ifdef USE_AS_MEMMOVE
	/* memmove only.  DEST below SRC: use the forward paths.  DEST equal
	   to SRC: go straight to the return sequence.  */
116	cmp	%eax, %edx
117	jb	L(copy_forward)
118	je	L(fwd_write_0bytes)
	/* DEST above SRC with fewer than 48 bytes.  */
119	cmp	$48, %ecx
120	jb	L(bk_write_less48bytes)
	/* Compare DEST with SRC + LEN.  The reload of SRC is a mov, which
	   leaves the flags of the cmp in place for the jb: the backward
	   copy is taken when DEST is below SRC + LEN.  */
121	add	%ecx, %eax
122	cmp	%eax, %edx
123	movl	SRC(%esp), %eax
124	jb	L(copy_backward)
125
126L(copy_forward):
127#endif
128	cmp	$48, %ecx
129	jae	L(48bytesormore)
130
/* Despite the label name, this point is reached whenever ECX < 48.  */
131L(fwd_write_less32bytes):
132#ifndef USE_AS_MEMMOVE
	/* Only the low bytes of SRC and DEST are compared: when AL is below
	   DL the dispatch goes through L(table_48_bytes_bwd) with EAX/EDX
	   still at the start of the buffers.  */
133	cmp	%dl, %al
134	jb	L(bk_write)
135#endif
	/* Forward table: its targets address the data with negative offsets,
	   so both pointers are first moved to the end of the buffers.  */
136	add	%ecx, %edx
137	add	%ecx, %eax
138	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
139#ifndef USE_AS_MEMMOVE
140L(bk_write):
141	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
142#endif
143
144	ALIGN (4)
145/* Reached with ECX >= 48.  Saves the first 16 source bytes in XMM0,
   rounds the destination up to the next 16-byte boundary and moves the
   source pointer and the length by the same number of bytes.  ESI keeps
   the original destination; each path branched to from here stores
   XMM0 there to cover the bytes skipped by the rounding.  */
146L(48bytesormore):
147	movdqu	(%eax), %xmm0
148	PUSH (%edi)
149	movl	%edx, %edi
150	and	$-16, %edx
151	PUSH (%esi)
	/* Unwind state with EDI and ESI pushed; the later blocks restore it
	   with cfi_restore_state.  */
152	cfi_remember_state
153	add	$16, %edx	/* EDX = (DEST & -16) + 16 */
154	movl	%edi, %esi	/* ESI = original DEST */
155	sub	%edx, %edi	/* EDI = DEST - EDX, a value in -16 .. -1 */
156	add	%edi, %ecx	/* ECX = bytes from EDX to the end */
157	sub	%edi, %eax	/* EAX = source position matching EDX */
158
	/* Compare the remaining length with half the shared cache size.  The
	   flags are consumed by the jae below; the mov in between does not
	   change them.  */
159#ifdef SHARED_CACHE_SIZE_HALF
160	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
161#else
162# ifdef PIC
163	SETUP_PIC_REG(bx)
164	add	$_GLOBAL_OFFSET_TABLE_, %ebx
165	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
166# else
167	cmp	__x86_shared_cache_size_half, %ecx
168# endif
169#endif
170
171	mov	%eax, %edi
172	jae	L(large_page)
	/* EDI = low four bits of the adjusted source pointer.  Zero means
	   source and destination are both 16-byte aligned; any other value
	   indexes L(shl_table).  */
173	and	$0xf, %edi
174	jz	L(shl_0)
175
176	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
177
178	ALIGN (4)
/* Source and destination are both 16-byte aligned (EAX & 15 == 0 after
   the setup at L(48bytesormore)).  Stores the saved head bytes, then
   either hands more than 127 bytes to L(shl_0_gobble) or copies 32
   bytes per step with aligned moves, at most four steps.  */
179L(shl_0):
180	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
181	xor	%edi, %edi	/* EDI = running byte index */
182	cmp	$127, %ecx
183	ja	L(shl_0_gobble)
	/* Bias ECX by -32; each step below exits on the borrow of its sub.
	   lea is used so that no flags are touched.  */
184	lea	-32(%ecx), %ecx
185L(shl_0_loop):
186	movdqa	(%eax, %edi), %xmm0
187	movdqa	16(%eax, %edi), %xmm1
188	sub	$32, %ecx
189	movdqa	%xmm0, (%edx, %edi)
190	movdqa	%xmm1, 16(%edx, %edi)
191	lea	32(%edi), %edi
192	jb	L(shl_0_end)
193
194	movdqa	(%eax, %edi), %xmm0
195	movdqa	16(%eax, %edi), %xmm1
196	sub	$32, %ecx
197	movdqa	%xmm0, (%edx, %edi)
198	movdqa	%xmm1, 16(%edx, %edi)
199	lea	32(%edi), %edi
200	jb	L(shl_0_end)
201
202	movdqa	(%eax, %edi), %xmm0
203	movdqa	16(%eax, %edi), %xmm1
204	sub	$32, %ecx
205	movdqa	%xmm0, (%edx, %edi)
206	movdqa	%xmm1, 16(%edx, %edi)
207	lea	32(%edi), %edi
208	jb	L(shl_0_end)
209
	/* Fourth step has no exit test; it falls into L(shl_0_end).  */
210	movdqa	(%eax, %edi), %xmm0
211	movdqa	16(%eax, %edi), %xmm1
212	sub	$32, %ecx
213	movdqa	%xmm0, (%edx, %edi)
214	movdqa	%xmm1, 16(%edx, %edi)
215	lea	32(%edi), %edi
216L(shl_0_end):
217	lea	32(%ecx), %ecx	/* remove the bias: ECX = bytes still to copy */
218	add	%ecx, %edi
219	add	%edi, %edx	/* EDX = end of destination */
220	add	%edi, %eax	/* EAX = end of source */
221	POP (%esi)
222	POP (%edi)
223	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
224
/* Aligned copy (source and destination both 16-byte aligned) of more
   than 127 bytes, entered from L(shl_0) with EDI == 0.  ESI is free for
   scratch use because the head bytes were already stored through it.
   The threshold is 7/8 of half the data cache size: at or above it the
   copy goes to L(shl_0_gobble_mem_start), below it the 128-byte loop
   here is used.  When DATA_CACHE_SIZE_HALF is defined the constant is
   loaded into EDI so that the threshold is derived exactly as in the
   run-time case.  A bare cmp against the constant would have its flags
   overwritten by the shr/sub/cmp below and, with EDI still zero, every
   copy would take the L(shl_0_gobble_mem_start) path.  */
225	cfi_restore_state
226	cfi_remember_state
227L(shl_0_gobble):
228
229#ifdef DATA_CACHE_SIZE_HALF
230	mov	$DATA_CACHE_SIZE_HALF, %edi
231#else
232# ifdef PIC
233	SETUP_PIC_REG(bx)
234	add	$_GLOBAL_OFFSET_TABLE_, %ebx
235	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
236# else
237	mov	__x86_data_cache_size_half, %edi
238# endif
239#endif
240	mov	%edi, %esi
241	shr	$3, %esi
242	sub	%esi, %edi	/* EDI = size - size / 8 */
243	cmp	%edi, %ecx
244	jae	L(shl_0_gobble_mem_start)
	/* Bias ECX by -128.  The loop continues while the sub inside it does
	   not borrow; the bias is removed after the loop.  */
245	sub	$128, %ecx
246	ALIGN (4)
247L(shl_0_gobble_cache_loop):
248	movdqa	(%eax), %xmm0
249	movaps	0x10(%eax), %xmm1
250	movaps	0x20(%eax), %xmm2
251	movaps	0x30(%eax), %xmm3
252	movaps	0x40(%eax), %xmm4
253	movaps	0x50(%eax), %xmm5
254	movaps	0x60(%eax), %xmm6
255	movaps	0x70(%eax), %xmm7
256	lea	0x80(%eax), %eax
	/* The flags of this sub are tested by the jae at the loop end; the
	   stores and the lea in between do not modify them.  */
257	sub	$128, %ecx
258	movdqa	%xmm0, (%edx)
259	movaps	%xmm1, 0x10(%edx)
260	movaps	%xmm2, 0x20(%edx)
261	movaps	%xmm3, 0x30(%edx)
262	movaps	%xmm4, 0x40(%edx)
263	movaps	%xmm5, 0x50(%edx)
264	movaps	%xmm6, 0x60(%edx)
265	movaps	%xmm7, 0x70(%edx)
266	lea	0x80(%edx), %edx
267
268	jae	L(shl_0_gobble_cache_loop)
269	add	$0x80, %ecx	/* ECX = bytes still to copy, below 128 */
270	cmp	$0x40, %ecx
271	jb	L(shl_0_cache_less_64bytes)
272
	/* At least 64 bytes left: copy 64.  */
273	movdqa	(%eax), %xmm0
274	sub	$0x40, %ecx
275	movdqa	0x10(%eax), %xmm1
276
277	movdqa	%xmm0, (%edx)
278	movdqa	%xmm1, 0x10(%edx)
279
280	movdqa	0x20(%eax), %xmm0
281	movdqa	0x30(%eax), %xmm1
282	add	$0x40, %eax
283
284	movdqa	%xmm0, 0x20(%edx)
285	movdqa	%xmm1, 0x30(%edx)
286	add	$0x40, %edx
287L(shl_0_cache_less_64bytes):
288	cmp	$0x20, %ecx
289	jb	L(shl_0_cache_less_32bytes)
	/* At least 32 bytes left: copy 32.  */
290	movdqa	(%eax), %xmm0
291	sub	$0x20, %ecx
292	movdqa	0x10(%eax), %xmm1
293	add	$0x20, %eax
294	movdqa	%xmm0, (%edx)
295	movdqa	%xmm1, 0x10(%edx)
296	add	$0x20, %edx
297L(shl_0_cache_less_32bytes):
298	cmp	$0x10, %ecx
299	jb	L(shl_0_cache_less_16bytes)
	/* At least 16 bytes left: copy 16.  */
300	sub	$0x10, %ecx
301	movdqa	(%eax), %xmm0
302	add	$0x10, %eax
303	movdqa	%xmm0, (%edx)
304	add	$0x10, %edx
305L(shl_0_cache_less_16bytes):
	/* Fewer than 16 bytes remain.  Move both pointers to the end of the
	   data and let the forward table copy the rest.  */
306	add	%ecx, %edx
307	add	%ecx, %eax
308	POP (%esi)
309	POP (%edi)
310	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
311
/* Aligned copy entered from L(shl_0_gobble) when ECX is at or above the
   threshold computed there.  If the low bytes of EAX and EDX are equal
   the copy is handed to L(copy_page_by_rep); otherwise 128 bytes are
   moved per iteration with aligned loads and stores, with prefetchnta
   issued 0x1c0 and 0x280 bytes ahead in both buffers.  */
312	cfi_restore_state
313	cfi_remember_state
314	ALIGN (4)
315L(shl_0_gobble_mem_start):
316	cmp	%al, %dl
317	je	L(copy_page_by_rep)
	/* Bias ECX by -128; the loop runs while the sub inside it does not
	   borrow.  */
318	sub	$128, %ecx
319L(shl_0_gobble_mem_loop):
320	prefetchnta 0x1c0(%eax)
321	prefetchnta 0x280(%eax)
322	prefetchnta 0x1c0(%edx)
323	prefetchnta 0x280(%edx)
324
325	movdqa	(%eax), %xmm0
326	movaps	0x10(%eax), %xmm1
327	movaps	0x20(%eax), %xmm2
328	movaps	0x30(%eax), %xmm3
329	movaps	0x40(%eax), %xmm4
330	movaps	0x50(%eax), %xmm5
331	movaps	0x60(%eax), %xmm6
332	movaps	0x70(%eax), %xmm7
333	lea	0x80(%eax), %eax
	/* The flags of this sub are tested by the jae at the loop end; the
	   stores and the lea in between do not modify them.  */
334	sub	$0x80, %ecx
335	movdqa	%xmm0, (%edx)
336	movaps	%xmm1, 0x10(%edx)
337	movaps	%xmm2, 0x20(%edx)
338	movaps	%xmm3, 0x30(%edx)
339	movaps	%xmm4, 0x40(%edx)
340	movaps	%xmm5, 0x50(%edx)
341	movaps	%xmm6, 0x60(%edx)
342	movaps	%xmm7, 0x70(%edx)
343	lea	0x80(%edx), %edx
344
345	jae	L(shl_0_gobble_mem_loop)
346	add	$0x80, %ecx	/* ECX = bytes still to copy, below 128 */
347	cmp	$0x40, %ecx
348	jb	L(shl_0_mem_less_64bytes)
349
	/* At least 64 bytes left: copy 64.  */
350	movdqa	(%eax), %xmm0
351	sub	$0x40, %ecx
352	movdqa	0x10(%eax), %xmm1
353
354	movdqa	%xmm0, (%edx)
355	movdqa	%xmm1, 0x10(%edx)
356
357	movdqa	0x20(%eax), %xmm0
358	movdqa	0x30(%eax), %xmm1
359	add	$0x40, %eax
360
361	movdqa	%xmm0, 0x20(%edx)
362	movdqa	%xmm1, 0x30(%edx)
363	add	$0x40, %edx
364L(shl_0_mem_less_64bytes):
365	cmp	$0x20, %ecx
366	jb	L(shl_0_mem_less_32bytes)
	/* At least 32 bytes left: copy 32.  */
367	movdqa	(%eax), %xmm0
368	sub	$0x20, %ecx
369	movdqa	0x10(%eax), %xmm1
370	add	$0x20, %eax
371	movdqa	%xmm0, (%edx)
372	movdqa	%xmm1, 0x10(%edx)
373	add	$0x20, %edx
374L(shl_0_mem_less_32bytes):
375	cmp	$0x10, %ecx
376	jb	L(shl_0_mem_less_16bytes)
	/* At least 16 bytes left: copy 16.  */
377	sub	$0x10, %ecx
378	movdqa	(%eax), %xmm0
379	add	$0x10, %eax
380	movdqa	%xmm0, (%edx)
381	add	$0x10, %edx
382L(shl_0_mem_less_16bytes):
	/* Fewer than 16 bytes remain.  Move both pointers to the end of the
	   data and let the forward table copy the rest.  */
383	add	%ecx, %edx
384	add	%ecx, %eax
385	POP (%esi)
386	POP (%edi)
387	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
388
/* Forward copy loop that realigns the source data with palignr $1.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 1 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
389	cfi_restore_state
390	cfi_remember_state
391	ALIGN (4)
392L(shl_1):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
393	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
394	sub	$1, %eax	/* loads below start 1 byte earlier */
395	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
396	xor	%edi, %edi	/* EDI = running byte index */
397	sub	$32, %ecx	/* bias ECX; removed at L(shl_1_end) */
398	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
399	POP (%esi)
400L(shl_1_loop):
401
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
402	movdqa	16(%eax, %edi), %xmm2
403	sub	$32, %ecx
404	movdqa	32(%eax, %edi), %xmm3
405	movdqa	%xmm3, %xmm4
406	palignr	$1, %xmm2, %xmm3
407	palignr	$1, %xmm1, %xmm2
408	lea	32(%edi), %edi
409	movdqa	%xmm2, -32(%edx, %edi)
410	movdqa	%xmm3, -16(%edx, %edi)
411
412	jb	L(shl_1_end)
413
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
414	movdqa	16(%eax, %edi), %xmm2
415	sub	$32, %ecx
416	movdqa	32(%eax, %edi), %xmm3
417	movdqa	%xmm3, %xmm1
418	palignr	$1, %xmm2, %xmm3
419	palignr	$1, %xmm4, %xmm2
420	lea	32(%edi), %edi
421	movdqa	%xmm2, -32(%edx, %edi)
422	movdqa	%xmm3, -16(%edx, %edi)
423
424	jae	L(shl_1_loop)
425
426L(shl_1_end):
427	add	$32, %ecx	/* ECX = bytes still to copy */
428	add	%ecx, %edi
429	add	%edi, %edx	/* EDX = end of destination */
430	lea	1(%edi, %eax), %eax	/* EAX = end of source, adds back the 1 */
431	POP (%edi)
432	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
433
/* Forward copy loop that realigns the source data with palignr $2.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 2 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
434	cfi_restore_state
435	cfi_remember_state
436	ALIGN (4)
437L(shl_2):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
438	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
439	sub	$2, %eax	/* loads below start 2 bytes earlier */
440	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
441	xor	%edi, %edi	/* EDI = running byte index */
442	sub	$32, %ecx	/* bias ECX; removed at L(shl_2_end) */
443	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
444	POP (%esi)
445L(shl_2_loop):
446
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
447	movdqa	16(%eax, %edi), %xmm2
448	sub	$32, %ecx
449	movdqa	32(%eax, %edi), %xmm3
450	movdqa	%xmm3, %xmm4
451	palignr	$2, %xmm2, %xmm3
452	palignr	$2, %xmm1, %xmm2
453	lea	32(%edi), %edi
454	movdqa	%xmm2, -32(%edx, %edi)
455	movdqa	%xmm3, -16(%edx, %edi)
456
457	jb	L(shl_2_end)
458
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
459	movdqa	16(%eax, %edi), %xmm2
460	sub	$32, %ecx
461	movdqa	32(%eax, %edi), %xmm3
462	movdqa	%xmm3, %xmm1
463	palignr	$2, %xmm2, %xmm3
464	palignr	$2, %xmm4, %xmm2
465	lea	32(%edi), %edi
466	movdqa	%xmm2, -32(%edx, %edi)
467	movdqa	%xmm3, -16(%edx, %edi)
468
469	jae	L(shl_2_loop)
470
471L(shl_2_end):
472	add	$32, %ecx	/* ECX = bytes still to copy */
473	add	%ecx, %edi
474	add	%edi, %edx	/* EDX = end of destination */
475	lea	2(%edi, %eax), %eax	/* EAX = end of source, adds back the 2 */
476	POP (%edi)
477	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
478
/* Forward copy loop that realigns the source data with palignr $3.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 3 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
479	cfi_restore_state
480	cfi_remember_state
481	ALIGN (4)
482L(shl_3):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
483	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
484	sub	$3, %eax	/* loads below start 3 bytes earlier */
485	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
486	xor	%edi, %edi	/* EDI = running byte index */
487	sub	$32, %ecx	/* bias ECX; removed at L(shl_3_end) */
488	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
489	POP (%esi)
490L(shl_3_loop):
491
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
492	movdqa	16(%eax, %edi), %xmm2
493	sub	$32, %ecx
494	movdqa	32(%eax, %edi), %xmm3
495	movdqa	%xmm3, %xmm4
496	palignr	$3, %xmm2, %xmm3
497	palignr	$3, %xmm1, %xmm2
498	lea	32(%edi), %edi
499	movdqa	%xmm2, -32(%edx, %edi)
500	movdqa	%xmm3, -16(%edx, %edi)
501
502	jb	L(shl_3_end)
503
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
504	movdqa	16(%eax, %edi), %xmm2
505	sub	$32, %ecx
506	movdqa	32(%eax, %edi), %xmm3
507	movdqa	%xmm3, %xmm1
508	palignr	$3, %xmm2, %xmm3
509	palignr	$3, %xmm4, %xmm2
510	lea	32(%edi), %edi
511	movdqa	%xmm2, -32(%edx, %edi)
512	movdqa	%xmm3, -16(%edx, %edi)
513
514	jae	L(shl_3_loop)
515
516L(shl_3_end):
517	add	$32, %ecx	/* ECX = bytes still to copy */
518	add	%ecx, %edi
519	add	%edi, %edx	/* EDX = end of destination */
520	lea	3(%edi, %eax), %eax	/* EAX = end of source, adds back the 3 */
521	POP (%edi)
522	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
523
/* Forward copy loop that realigns the source data with palignr $4.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 4 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
524	cfi_restore_state
525	cfi_remember_state
526	ALIGN (4)
527L(shl_4):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
528	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
529	sub	$4, %eax	/* loads below start 4 bytes earlier */
530	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
531	xor	%edi, %edi	/* EDI = running byte index */
532	sub	$32, %ecx	/* bias ECX; removed at L(shl_4_end) */
533	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
534	POP (%esi)
535L(shl_4_loop):
536
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
537	movdqa	16(%eax, %edi), %xmm2
538	sub	$32, %ecx
539	movdqa	32(%eax, %edi), %xmm3
540	movdqa	%xmm3, %xmm4
541	palignr	$4, %xmm2, %xmm3
542	palignr	$4, %xmm1, %xmm2
543	lea	32(%edi), %edi
544	movdqa	%xmm2, -32(%edx, %edi)
545	movdqa	%xmm3, -16(%edx, %edi)
546
547	jb	L(shl_4_end)
548
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
549	movdqa	16(%eax, %edi), %xmm2
550	sub	$32, %ecx
551	movdqa	32(%eax, %edi), %xmm3
552	movdqa	%xmm3, %xmm1
553	palignr	$4, %xmm2, %xmm3
554	palignr	$4, %xmm4, %xmm2
555	lea	32(%edi), %edi
556	movdqa	%xmm2, -32(%edx, %edi)
557	movdqa	%xmm3, -16(%edx, %edi)
558
559	jae	L(shl_4_loop)
560
561L(shl_4_end):
562	add	$32, %ecx	/* ECX = bytes still to copy */
563	add	%ecx, %edi
564	add	%edi, %edx	/* EDX = end of destination */
565	lea	4(%edi, %eax), %eax	/* EAX = end of source, adds back the 4 */
566	POP (%edi)
567	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
568
/* Forward copy loop that realigns the source data with palignr $5.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 5 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
569	cfi_restore_state
570	cfi_remember_state
571	ALIGN (4)
572L(shl_5):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
573	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
574	sub	$5, %eax	/* loads below start 5 bytes earlier */
575	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
576	xor	%edi, %edi	/* EDI = running byte index */
577	sub	$32, %ecx	/* bias ECX; removed at L(shl_5_end) */
578	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
579	POP (%esi)
580L(shl_5_loop):
581
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
582	movdqa	16(%eax, %edi), %xmm2
583	sub	$32, %ecx
584	movdqa	32(%eax, %edi), %xmm3
585	movdqa	%xmm3, %xmm4
586	palignr	$5, %xmm2, %xmm3
587	palignr	$5, %xmm1, %xmm2
588	lea	32(%edi), %edi
589	movdqa	%xmm2, -32(%edx, %edi)
590	movdqa	%xmm3, -16(%edx, %edi)
591
592	jb	L(shl_5_end)
593
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
594	movdqa	16(%eax, %edi), %xmm2
595	sub	$32, %ecx
596	movdqa	32(%eax, %edi), %xmm3
597	movdqa	%xmm3, %xmm1
598	palignr	$5, %xmm2, %xmm3
599	palignr	$5, %xmm4, %xmm2
600	lea	32(%edi), %edi
601	movdqa	%xmm2, -32(%edx, %edi)
602	movdqa	%xmm3, -16(%edx, %edi)
603
604	jae	L(shl_5_loop)
605
606L(shl_5_end):
607	add	$32, %ecx	/* ECX = bytes still to copy */
608	add	%ecx, %edi
609	add	%edi, %edx	/* EDX = end of destination */
610	lea	5(%edi, %eax), %eax	/* EAX = end of source, adds back the 5 */
611	POP (%edi)
612	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
613
/* Forward copy loop that realigns the source data with palignr $6.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 6 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
614	cfi_restore_state
615	cfi_remember_state
616	ALIGN (4)
617L(shl_6):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
618	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
619	sub	$6, %eax	/* loads below start 6 bytes earlier */
620	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
621	xor	%edi, %edi	/* EDI = running byte index */
622	sub	$32, %ecx	/* bias ECX; removed at L(shl_6_end) */
623	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
624	POP (%esi)
625L(shl_6_loop):
626
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
627	movdqa	16(%eax, %edi), %xmm2
628	sub	$32, %ecx
629	movdqa	32(%eax, %edi), %xmm3
630	movdqa	%xmm3, %xmm4
631	palignr	$6, %xmm2, %xmm3
632	palignr	$6, %xmm1, %xmm2
633	lea	32(%edi), %edi
634	movdqa	%xmm2, -32(%edx, %edi)
635	movdqa	%xmm3, -16(%edx, %edi)
636
637	jb	L(shl_6_end)
638
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
639	movdqa	16(%eax, %edi), %xmm2
640	sub	$32, %ecx
641	movdqa	32(%eax, %edi), %xmm3
642	movdqa	%xmm3, %xmm1
643	palignr	$6, %xmm2, %xmm3
644	palignr	$6, %xmm4, %xmm2
645	lea	32(%edi), %edi
646	movdqa	%xmm2, -32(%edx, %edi)
647	movdqa	%xmm3, -16(%edx, %edi)
648
649	jae	L(shl_6_loop)
650
651L(shl_6_end):
652	add	$32, %ecx	/* ECX = bytes still to copy */
653	add	%ecx, %edi
654	add	%edi, %edx	/* EDX = end of destination */
655	lea	6(%edi, %eax), %eax	/* EAX = end of source, adds back the 6 */
656	POP (%edi)
657	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
658
/* Forward copy loop that realigns the source data with palignr $7.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 7 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
659	cfi_restore_state
660	cfi_remember_state
661	ALIGN (4)
662L(shl_7):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
663	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
664	sub	$7, %eax	/* loads below start 7 bytes earlier */
665	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
666	xor	%edi, %edi	/* EDI = running byte index */
667	sub	$32, %ecx	/* bias ECX; removed at L(shl_7_end) */
668	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
669	POP (%esi)
670L(shl_7_loop):
671
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
672	movdqa	16(%eax, %edi), %xmm2
673	sub	$32, %ecx
674	movdqa	32(%eax, %edi), %xmm3
675	movdqa	%xmm3, %xmm4
676	palignr	$7, %xmm2, %xmm3
677	palignr	$7, %xmm1, %xmm2
678	lea	32(%edi), %edi
679	movdqa	%xmm2, -32(%edx, %edi)
680	movdqa	%xmm3, -16(%edx, %edi)
681
682	jb	L(shl_7_end)
683
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
684	movdqa	16(%eax, %edi), %xmm2
685	sub	$32, %ecx
686	movdqa	32(%eax, %edi), %xmm3
687	movdqa	%xmm3, %xmm1
688	palignr	$7, %xmm2, %xmm3
689	palignr	$7, %xmm4, %xmm2
690	lea	32(%edi), %edi
691	movdqa	%xmm2, -32(%edx, %edi)
692	movdqa	%xmm3, -16(%edx, %edi)
693
694	jae	L(shl_7_loop)
695
696L(shl_7_end):
697	add	$32, %ecx	/* ECX = bytes still to copy */
698	add	%ecx, %edi
699	add	%edi, %edx	/* EDX = end of destination */
700	lea	7(%edi, %eax), %eax	/* EAX = end of source, adds back the 7 */
701	POP (%edi)
702	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
703
/* Forward copy loop that realigns the source data with palignr $8.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 8 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
704	cfi_restore_state
705	cfi_remember_state
706	ALIGN (4)
707L(shl_8):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
708	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
709	sub	$8, %eax	/* loads below start 8 bytes earlier */
710	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
711	xor	%edi, %edi	/* EDI = running byte index */
712	sub	$32, %ecx	/* bias ECX; removed at L(shl_8_end) */
713	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
714	POP (%esi)
715L(shl_8_loop):
716
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
717	movdqa	16(%eax, %edi), %xmm2
718	sub	$32, %ecx
719	movdqa	32(%eax, %edi), %xmm3
720	movdqa	%xmm3, %xmm4
721	palignr	$8, %xmm2, %xmm3
722	palignr	$8, %xmm1, %xmm2
723	lea	32(%edi), %edi
724	movdqa	%xmm2, -32(%edx, %edi)
725	movdqa	%xmm3, -16(%edx, %edi)
726
727	jb	L(shl_8_end)
728
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
729	movdqa	16(%eax, %edi), %xmm2
730	sub	$32, %ecx
731	movdqa	32(%eax, %edi), %xmm3
732	movdqa	%xmm3, %xmm1
733	palignr	$8, %xmm2, %xmm3
734	palignr	$8, %xmm4, %xmm2
735	lea	32(%edi), %edi
736	movdqa	%xmm2, -32(%edx, %edi)
737	movdqa	%xmm3, -16(%edx, %edi)
738
739	jae	L(shl_8_loop)
740
741L(shl_8_end):
742	add	$32, %ecx	/* ECX = bytes still to copy */
743	add	%ecx, %edi
744	add	%edi, %edx	/* EDX = end of destination */
745	lea	8(%edi, %eax), %eax	/* EAX = end of source, adds back the 8 */
746	POP (%edi)
747	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
748
/* Forward copy loop that realigns the source data with palignr $9.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 9 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
749	cfi_restore_state
750	cfi_remember_state
751	ALIGN (4)
752L(shl_9):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
753	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
754	sub	$9, %eax	/* loads below start 9 bytes earlier */
755	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
756	xor	%edi, %edi	/* EDI = running byte index */
757	sub	$32, %ecx	/* bias ECX; removed at L(shl_9_end) */
758	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
759	POP (%esi)
760L(shl_9_loop):
761
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
762	movdqa	16(%eax, %edi), %xmm2
763	sub	$32, %ecx
764	movdqa	32(%eax, %edi), %xmm3
765	movdqa	%xmm3, %xmm4
766	palignr	$9, %xmm2, %xmm3
767	palignr	$9, %xmm1, %xmm2
768	lea	32(%edi), %edi
769	movdqa	%xmm2, -32(%edx, %edi)
770	movdqa	%xmm3, -16(%edx, %edi)
771
772	jb	L(shl_9_end)
773
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
774	movdqa	16(%eax, %edi), %xmm2
775	sub	$32, %ecx
776	movdqa	32(%eax, %edi), %xmm3
777	movdqa	%xmm3, %xmm1
778	palignr	$9, %xmm2, %xmm3
779	palignr	$9, %xmm4, %xmm2
780	lea	32(%edi), %edi
781	movdqa	%xmm2, -32(%edx, %edi)
782	movdqa	%xmm3, -16(%edx, %edi)
783
784	jae	L(shl_9_loop)
785
786L(shl_9_end):
787	add	$32, %ecx	/* ECX = bytes still to copy */
788	add	%ecx, %edi
789	add	%edi, %edx	/* EDX = end of destination */
790	lea	9(%edi, %eax), %eax	/* EAX = end of source, adds back the 9 */
791	POP (%edi)
792	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
793
/* Forward copy loop that realigns the source data with palignr $10.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 10 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
794	cfi_restore_state
795	cfi_remember_state
796	ALIGN (4)
797L(shl_10):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
798	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
799	sub	$10, %eax	/* loads below start 10 bytes earlier */
800	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
801	xor	%edi, %edi	/* EDI = running byte index */
802	sub	$32, %ecx	/* bias ECX; removed at L(shl_10_end) */
803	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
804	POP (%esi)
805L(shl_10_loop):
806
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
807	movdqa	16(%eax, %edi), %xmm2
808	sub	$32, %ecx
809	movdqa	32(%eax, %edi), %xmm3
810	movdqa	%xmm3, %xmm4
811	palignr	$10, %xmm2, %xmm3
812	palignr	$10, %xmm1, %xmm2
813	lea	32(%edi), %edi
814	movdqa	%xmm2, -32(%edx, %edi)
815	movdqa	%xmm3, -16(%edx, %edi)
816
817	jb	L(shl_10_end)
818
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
819	movdqa	16(%eax, %edi), %xmm2
820	sub	$32, %ecx
821	movdqa	32(%eax, %edi), %xmm3
822	movdqa	%xmm3, %xmm1
823	palignr	$10, %xmm2, %xmm3
824	palignr	$10, %xmm4, %xmm2
825	lea	32(%edi), %edi
826	movdqa	%xmm2, -32(%edx, %edi)
827	movdqa	%xmm3, -16(%edx, %edi)
828
829	jae	L(shl_10_loop)
830
831L(shl_10_end):
832	add	$32, %ecx	/* ECX = bytes still to copy */
833	add	%ecx, %edi
834	add	%edi, %edx	/* EDX = end of destination */
835	lea	10(%edi, %eax), %eax	/* EAX = end of source, adds back the 10 */
836	POP (%edi)
837	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
838
/* Forward copy loop that realigns the source data with palignr $11.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 11 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
839	cfi_restore_state
840	cfi_remember_state
841	ALIGN (4)
842L(shl_11):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
843	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
844	sub	$11, %eax	/* loads below start 11 bytes earlier */
845	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
846	xor	%edi, %edi	/* EDI = running byte index */
847	sub	$32, %ecx	/* bias ECX; removed at L(shl_11_end) */
848	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
849	POP (%esi)
850L(shl_11_loop):
851
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
852	movdqa	16(%eax, %edi), %xmm2
853	sub	$32, %ecx
854	movdqa	32(%eax, %edi), %xmm3
855	movdqa	%xmm3, %xmm4
856	palignr	$11, %xmm2, %xmm3
857	palignr	$11, %xmm1, %xmm2
858	lea	32(%edi), %edi
859	movdqa	%xmm2, -32(%edx, %edi)
860	movdqa	%xmm3, -16(%edx, %edi)
861
862	jb	L(shl_11_end)
863
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
864	movdqa	16(%eax, %edi), %xmm2
865	sub	$32, %ecx
866	movdqa	32(%eax, %edi), %xmm3
867	movdqa	%xmm3, %xmm1
868	palignr	$11, %xmm2, %xmm3
869	palignr	$11, %xmm4, %xmm2
870	lea	32(%edi), %edi
871	movdqa	%xmm2, -32(%edx, %edi)
872	movdqa	%xmm3, -16(%edx, %edi)
873
874	jae	L(shl_11_loop)
875
876L(shl_11_end):
877	add	$32, %ecx	/* ECX = bytes still to copy */
878	add	%ecx, %edi
879	add	%edi, %edx	/* EDX = end of destination */
880	lea	11(%edi, %eax), %eax	/* EAX = end of source, adds back the 11 */
881	POP (%edi)
882	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
883
/* Forward copy loop that realigns the source data with palignr $12.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 12 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
884	cfi_restore_state
885	cfi_remember_state
886	ALIGN (4)
887L(shl_12):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
888	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
889	sub	$12, %eax	/* loads below start 12 bytes earlier */
890	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
891	xor	%edi, %edi	/* EDI = running byte index */
892	sub	$32, %ecx	/* bias ECX; removed at L(shl_12_end) */
893	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
894	POP (%esi)
895L(shl_12_loop):
896
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
897	movdqa	16(%eax, %edi), %xmm2
898	sub	$32, %ecx
899	movdqa	32(%eax, %edi), %xmm3
900	movdqa	%xmm3, %xmm4
901	palignr	$12, %xmm2, %xmm3
902	palignr	$12, %xmm1, %xmm2
903	lea	32(%edi), %edi
904	movdqa	%xmm2, -32(%edx, %edi)
905	movdqa	%xmm3, -16(%edx, %edi)
906
907	jb	L(shl_12_end)
908
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
909	movdqa	16(%eax, %edi), %xmm2
910	sub	$32, %ecx
911	movdqa	32(%eax, %edi), %xmm3
912	movdqa	%xmm3, %xmm1
913	palignr	$12, %xmm2, %xmm3
914	palignr	$12, %xmm4, %xmm2
915	lea	32(%edi), %edi
916	movdqa	%xmm2, -32(%edx, %edi)
917	movdqa	%xmm3, -16(%edx, %edi)
918
919	jae	L(shl_12_loop)
920
921L(shl_12_end):
922	add	$32, %ecx	/* ECX = bytes still to copy */
923	add	%ecx, %edi
924	add	%edi, %edx	/* EDX = end of destination */
925	lea	12(%edi, %eax), %eax	/* EAX = end of source, adds back the 12 */
926	POP (%edi)
927	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
928
/* Forward copy loop that realigns the source data with palignr $13.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 13 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
929	cfi_restore_state
930	cfi_remember_state
931	ALIGN (4)
932L(shl_13):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
933	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
934	sub	$13, %eax	/* loads below start 13 bytes earlier */
935	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
936	xor	%edi, %edi	/* EDI = running byte index */
937	sub	$32, %ecx	/* bias ECX; removed at L(shl_13_end) */
938	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
939	POP (%esi)
940L(shl_13_loop):
941
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
942	movdqa	16(%eax, %edi), %xmm2
943	sub	$32, %ecx
944	movdqa	32(%eax, %edi), %xmm3
945	movdqa	%xmm3, %xmm4
946	palignr	$13, %xmm2, %xmm3
947	palignr	$13, %xmm1, %xmm2
948	lea	32(%edi), %edi
949	movdqa	%xmm2, -32(%edx, %edi)
950	movdqa	%xmm3, -16(%edx, %edi)
951
952	jb	L(shl_13_end)
953
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
954	movdqa	16(%eax, %edi), %xmm2
955	sub	$32, %ecx
956	movdqa	32(%eax, %edi), %xmm3
957	movdqa	%xmm3, %xmm1
958	palignr	$13, %xmm2, %xmm3
959	palignr	$13, %xmm4, %xmm2
960	lea	32(%edi), %edi
961	movdqa	%xmm2, -32(%edx, %edi)
962	movdqa	%xmm3, -16(%edx, %edi)
963
964	jae	L(shl_13_loop)
965
966L(shl_13_end):
967	add	$32, %ecx	/* ECX = bytes still to copy */
968	add	%ecx, %edi
969	add	%edi, %edx	/* EDX = end of destination */
970	lea	13(%edi, %eax), %eax	/* EAX = end of source, adds back the 13 */
971	POP (%edi)
972	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
973
/* Forward copy loop that realigns the source data with palignr $14.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 14 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
974	cfi_restore_state
975	cfi_remember_state
976	ALIGN (4)
977L(shl_14):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
978	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
979	sub	$14, %eax	/* loads below start 14 bytes earlier */
980	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
981	xor	%edi, %edi	/* EDI = running byte index */
982	sub	$32, %ecx	/* bias ECX; removed at L(shl_14_end) */
983	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
984	POP (%esi)
985L(shl_14_loop):
986
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
987	movdqa	16(%eax, %edi), %xmm2
988	sub	$32, %ecx
989	movdqa	32(%eax, %edi), %xmm3
990	movdqa	%xmm3, %xmm4
991	palignr	$14, %xmm2, %xmm3
992	palignr	$14, %xmm1, %xmm2
993	lea	32(%edi), %edi
994	movdqa	%xmm2, -32(%edx, %edi)
995	movdqa	%xmm3, -16(%edx, %edi)
996
997	jb	L(shl_14_end)
998
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
999	movdqa	16(%eax, %edi), %xmm2
1000	sub	$32, %ecx
1001	movdqa	32(%eax, %edi), %xmm3
1002	movdqa	%xmm3, %xmm1
1003	palignr	$14, %xmm2, %xmm3
1004	palignr	$14, %xmm4, %xmm2
1005	lea	32(%edi), %edi
1006	movdqa	%xmm2, -32(%edx, %edi)
1007	movdqa	%xmm3, -16(%edx, %edi)
1008
1009	jae	L(shl_14_loop)
1010
1011L(shl_14_end):
1012	add	$32, %ecx	/* ECX = bytes still to copy */
1013	add	%ecx, %edi
1014	add	%edi, %edx	/* EDX = end of destination */
1015	lea	14(%edi, %eax), %eax	/* EAX = end of source, adds back the 14 */
1016	POP (%edi)
1017	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1018
/* Forward copy loop that realigns the source data with palignr $15.
   NOTE(review): presumably entered through L(shl_table) when
   (EAX & 15) == 15 -- the table is not defined in this part of the file.
   Registers on entry, as set up at L(48bytesormore): XMM0 = first 16
   source bytes, ESI = original destination, EDX = destination rounded
   up to 16 bytes, EAX = matching source position, ECX = bytes left.
   The restore/remember pair re-establishes the unwind state recorded
   after EDI and ESI were pushed.  */
1019	cfi_restore_state
1020	cfi_remember_state
1021	ALIGN (4)
1022L(shl_15):
	/* PIC only (empty otherwise): EBX += table - here, which yields the
	   address of L(table_48bytes_fwd) provided EBX holds the address of
	   this label on entry, as the PIC BRANCH_TO_JMPTBL_ENTRY leaves it.  */
1023	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1024	sub	$15, %eax	/* loads below start 15 bytes earlier */
1025	movaps	(%eax), %xmm1	/* XMM1 = block before the first loop load */
1026	xor	%edi, %edi	/* EDI = running byte index */
1027	sub	$32, %ecx	/* bias ECX; removed at L(shl_15_end) */
1028	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
1029	POP (%esi)
1030L(shl_15_loop):
1031
	/* First half: XMM4 keeps the newest load for the second half.  The
	   jb tests the borrow of the sub; the instructions in between do
	   not write EFLAGS.  */
1032	movdqa	16(%eax, %edi), %xmm2
1033	sub	$32, %ecx
1034	movdqa	32(%eax, %edi), %xmm3
1035	movdqa	%xmm3, %xmm4
1036	palignr	$15, %xmm2, %xmm3
1037	palignr	$15, %xmm1, %xmm2
1038	lea	32(%edi), %edi
1039	movdqa	%xmm2, -32(%edx, %edi)
1040	movdqa	%xmm3, -16(%edx, %edi)
1041
1042	jb	L(shl_15_end)
1043
	/* Second half: same step with the roles of XMM1 and XMM4 swapped.  */
1044	movdqa	16(%eax, %edi), %xmm2
1045	sub	$32, %ecx
1046	movdqa	32(%eax, %edi), %xmm3
1047	movdqa	%xmm3, %xmm1
1048	palignr	$15, %xmm2, %xmm3
1049	palignr	$15, %xmm4, %xmm2
1050	lea	32(%edi), %edi
1051	movdqa	%xmm2, -32(%edx, %edi)
1052	movdqa	%xmm3, -16(%edx, %edi)
1053
1054	jae	L(shl_15_loop)
1055
1056L(shl_15_end):
1057	add	$32, %ecx	/* ECX = bytes still to copy */
1058	add	%ecx, %edi
1059	add	%edi, %edx	/* EDX = end of destination */
1060	lea	15(%edi, %eax), %eax	/* EAX = end of source, adds back the 15 */
1061	POP (%edi)
1062	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1063
1064
1065	ALIGN (4)
/* Tail copies for 44, 40, ..., 4 and 0 bytes.  EAX and EDX point just
   past the end of source and destination; each label copies one dword
   at a negative offset and falls through to the next smaller size.
   NOTE(review): presumably the targets of L(table_48bytes_fwd), which
   is not defined in this part of the file.  */
1066L(fwd_write_44bytes):
1067	movl	-44(%eax), %ecx
1068	movl	%ecx, -44(%edx)
1069L(fwd_write_40bytes):
1070	movl	-40(%eax), %ecx
1071	movl	%ecx, -40(%edx)
1072L(fwd_write_36bytes):
1073	movl	-36(%eax), %ecx
1074	movl	%ecx, -36(%edx)
1075L(fwd_write_32bytes):
1076	movl	-32(%eax), %ecx
1077	movl	%ecx, -32(%edx)
1078L(fwd_write_28bytes):
1079	movl	-28(%eax), %ecx
1080	movl	%ecx, -28(%edx)
1081L(fwd_write_24bytes):
1082	movl	-24(%eax), %ecx
1083	movl	%ecx, -24(%edx)
1084L(fwd_write_20bytes):
1085	movl	-20(%eax), %ecx
1086	movl	%ecx, -20(%edx)
1087L(fwd_write_16bytes):
1088	movl	-16(%eax), %ecx
1089	movl	%ecx, -16(%edx)
1090L(fwd_write_12bytes):
1091	movl	-12(%eax), %ecx
1092	movl	%ecx, -12(%edx)
1093L(fwd_write_8bytes):
1094	movl	-8(%eax), %ecx
1095	movl	%ecx, -8(%edx)
1096L(fwd_write_4bytes):
1097	movl	-4(%eax), %ecx
1098	movl	%ecx, -4(%edx)
/* Return sequence.  The mempcpy variant returns EDX, the end of the
   destination; the others reload DEST from the stack.  bcopy variants
   set no return value.  */
1099L(fwd_write_0bytes):
1100#ifndef USE_AS_BCOPY
1101# ifdef USE_AS_MEMPCPY
1102	movl	%edx, %eax
1103# else
1104	movl	DEST(%esp), %eax
1105# endif
1106#endif
1107	RETURN
1108
1109	ALIGN (4)
/* Tail copy of 5 bytes as two overlapping dwords (offsets -5 and -4
   from the end pointers).  Both loads are done before either store.
   EAX is reused as a data register because the source pointer is not
   needed afterwards.  */
1110L(fwd_write_5bytes):
1111	movl	-5(%eax), %ecx
1112	movl	-4(%eax), %eax
1113	movl	%ecx, -5(%edx)
1114	movl	%eax, -4(%edx)
	/* Return value: EDX (end of destination) for mempcpy, DEST reloaded
	   from the stack otherwise; none for bcopy.  */
1115#ifndef USE_AS_BCOPY
1116# ifdef USE_AS_MEMPCPY
1117	movl	%edx, %eax
1118# else
1119	movl	DEST(%esp), %eax
1120# endif
1121#endif
1122	RETURN
1123
1124	ALIGN (4)
/* Tail copies for 45, 41, ..., 9 and 1 bytes.  EAX and EDX point just
   past the end of source and destination; each label copies one dword
   at a negative offset and falls through.  The 9-byte entry copies the
   dwords at -9 and -5, and the final byte at -1 is copied on its own.
   NOTE(review): presumably the targets of L(table_48bytes_fwd), which
   is not defined in this part of the file.  */
1125L(fwd_write_45bytes):
1126	movl	-45(%eax), %ecx
1127	movl	%ecx, -45(%edx)
1128L(fwd_write_41bytes):
1129	movl	-41(%eax), %ecx
1130	movl	%ecx, -41(%edx)
1131L(fwd_write_37bytes):
1132	movl	-37(%eax), %ecx
1133	movl	%ecx, -37(%edx)
1134L(fwd_write_33bytes):
1135	movl	-33(%eax), %ecx
1136	movl	%ecx, -33(%edx)
1137L(fwd_write_29bytes):
1138	movl	-29(%eax), %ecx
1139	movl	%ecx, -29(%edx)
1140L(fwd_write_25bytes):
1141	movl	-25(%eax), %ecx
1142	movl	%ecx, -25(%edx)
1143L(fwd_write_21bytes):
1144	movl	-21(%eax), %ecx
1145	movl	%ecx, -21(%edx)
1146L(fwd_write_17bytes):
1147	movl	-17(%eax), %ecx
1148	movl	%ecx, -17(%edx)
1149L(fwd_write_13bytes):
1150	movl	-13(%eax), %ecx
1151	movl	%ecx, -13(%edx)
1152L(fwd_write_9bytes):
1153	movl	-9(%eax), %ecx
1154	movl	%ecx, -9(%edx)
1155	movl	-5(%eax), %ecx
1156	movl	%ecx, -5(%edx)
1157L(fwd_write_1bytes):
1158	movzbl	-1(%eax), %ecx
1159	movb	%cl, -1(%edx)
	/* Return value: EDX (end of destination) for mempcpy, DEST reloaded
	   from the stack otherwise; none for bcopy.  */
1160#ifndef USE_AS_BCOPY
1161# ifdef USE_AS_MEMPCPY
1162	movl	%edx, %eax
1163# else
1164	movl	DEST(%esp), %eax
1165# endif
1166#endif
1167	RETURN
1168
1169	ALIGN (4)
/* Tail copies for 46, 42, ..., 6 and 2 bytes.  EAX and EDX point just
   past the end of source and destination; each label copies one dword
   at a negative offset and falls through, and the last two bytes are
   copied as one 16-bit word.
   NOTE(review): presumably the targets of L(table_48bytes_fwd), which
   is not defined in this part of the file.  */
1170L(fwd_write_46bytes):
1171	movl	-46(%eax), %ecx
1172	movl	%ecx, -46(%edx)
1173L(fwd_write_42bytes):
1174	movl	-42(%eax), %ecx
1175	movl	%ecx, -42(%edx)
1176L(fwd_write_38bytes):
1177	movl	-38(%eax), %ecx
1178	movl	%ecx, -38(%edx)
1179L(fwd_write_34bytes):
1180	movl	-34(%eax), %ecx
1181	movl	%ecx, -34(%edx)
1182L(fwd_write_30bytes):
1183	movl	-30(%eax), %ecx
1184	movl	%ecx, -30(%edx)
1185L(fwd_write_26bytes):
1186	movl	-26(%eax), %ecx
1187	movl	%ecx, -26(%edx)
1188L(fwd_write_22bytes):
1189	movl	-22(%eax), %ecx
1190	movl	%ecx, -22(%edx)
1191L(fwd_write_18bytes):
1192	movl	-18(%eax), %ecx
1193	movl	%ecx, -18(%edx)
1194L(fwd_write_14bytes):
1195	movl	-14(%eax), %ecx
1196	movl	%ecx, -14(%edx)
1197L(fwd_write_10bytes):
1198	movl	-10(%eax), %ecx
1199	movl	%ecx, -10(%edx)
1200L(fwd_write_6bytes):
1201	movl	-6(%eax), %ecx
1202	movl	%ecx, -6(%edx)
1203L(fwd_write_2bytes):
1204	movzwl	-2(%eax), %ecx
1205	movw	%cx, -2(%edx)
	/* Return value: EDX (end of destination) for mempcpy, DEST reloaded
	   from the stack otherwise; none for bcopy.  */
1206#ifndef USE_AS_BCOPY
1207# ifdef USE_AS_MEMPCPY
1208	movl	%edx, %eax
1209# else
1210	movl	DEST(%esp), %eax
1211# endif
1212#endif
1213	RETURN
1214
1215	ALIGN (4)
/* Tail copies for 47, 43, ..., 7 and 3 bytes.  EAX and EDX point just
   past the end of source and destination; each label copies one dword
   at a negative offset and falls through.  The last three bytes are
   copied as a 16-bit word at -3 plus a byte at -1, with both loads done
   before the stores because EAX is reused as a data register.
   NOTE(review): presumably the targets of L(table_48bytes_fwd), which
   is not defined in this part of the file.  */
1216L(fwd_write_47bytes):
1217	movl	-47(%eax), %ecx
1218	movl	%ecx, -47(%edx)
1219L(fwd_write_43bytes):
1220	movl	-43(%eax), %ecx
1221	movl	%ecx, -43(%edx)
1222L(fwd_write_39bytes):
1223	movl	-39(%eax), %ecx
1224	movl	%ecx, -39(%edx)
1225L(fwd_write_35bytes):
1226	movl	-35(%eax), %ecx
1227	movl	%ecx, -35(%edx)
1228L(fwd_write_31bytes):
1229	movl	-31(%eax), %ecx
1230	movl	%ecx, -31(%edx)
1231L(fwd_write_27bytes):
1232	movl	-27(%eax), %ecx
1233	movl	%ecx, -27(%edx)
1234L(fwd_write_23bytes):
1235	movl	-23(%eax), %ecx
1236	movl	%ecx, -23(%edx)
1237L(fwd_write_19bytes):
1238	movl	-19(%eax), %ecx
1239	movl	%ecx, -19(%edx)
1240L(fwd_write_15bytes):
1241	movl	-15(%eax), %ecx
1242	movl	%ecx, -15(%edx)
1243L(fwd_write_11bytes):
1244	movl	-11(%eax), %ecx
1245	movl	%ecx, -11(%edx)
1246L(fwd_write_7bytes):
1247	movl	-7(%eax), %ecx
1248	movl	%ecx, -7(%edx)
1249L(fwd_write_3bytes):
1250	movzwl	-3(%eax), %ecx
1251	movzbl	-1(%eax), %eax
1252	movw	%cx, -3(%edx)
1253	movb	%al, -1(%edx)
	/* Return value: EDX (end of destination) for mempcpy, DEST reloaded
	   from the stack otherwise; none for bcopy.  */
1254#ifndef USE_AS_BCOPY
1255# ifdef USE_AS_MEMPCPY
1256	movl	%edx, %eax
1257# else
1258	movl	DEST(%esp), %eax
1259# endif
1260#endif
	/* RETURN_END rather than RETURN: no CFI_PUSH follows the ret, since
	   the code after this point starts with cfi_restore_state.  */
1261	RETURN_END
1262
/* Copy path taken from L(48bytesormore) when the remaining length is at
   least half the shared cache size.  Data is read with unaligned loads
   and written with non-temporal stores (movntdq) to the 16-byte aligned
   destination.  The restore/remember pair re-establishes the unwind
   state recorded after EDI and ESI were pushed.  */
1263	cfi_restore_state
1264	cfi_remember_state
1265	ALIGN (4)
1266L(large_page):
1267	movdqu	(%eax), %xmm1
1268	movdqu	%xmm0, (%esi)	/* head bytes to the original destination */
1269	movntdq	%xmm1, (%edx)
1270	add	$0x10, %eax
1271	add	$0x10, %edx
1272	sub	$0x10, %ecx
	/* Equal low bytes of source and destination: use the rep movsl path,
	   which pops ESI and EDI itself.  */
1273	cmp	%al, %dl
1274	je	L(copy_page_by_rep)
1275L(large_page_loop_init):
	/* ESI and EDI are not used by the loop, so they are restored here.
	   ECX is biased by -128; the loop runs while the sub inside it does
	   not borrow.  */
1276	POP (%esi)
1277	sub	$0x80, %ecx
1278	POP (%edi)
1279L(large_page_loop):
1280	prefetchnta	0x1c0(%eax)
1281	prefetchnta	0x280(%eax)
1282	movdqu	(%eax), %xmm0
1283	movdqu	0x10(%eax), %xmm1
1284	movdqu	0x20(%eax), %xmm2
1285	movdqu	0x30(%eax), %xmm3
1286	movdqu	0x40(%eax), %xmm4
1287	movdqu	0x50(%eax), %xmm5
1288	movdqu	0x60(%eax), %xmm6
1289	movdqu	0x70(%eax), %xmm7
1290	lea	0x80(%eax), %eax
	/* lfence separates the eight loads from the non-temporal stores.  */
1291	lfence
	/* The flags of this sub are tested by the jae at the loop end; the
	   stores and the lea in between do not modify them.  */
1292	sub	$0x80, %ecx
1293	movntdq	%xmm0, (%edx)
1294	movntdq	%xmm1, 0x10(%edx)
1295	movntdq	%xmm2, 0x20(%edx)
1296	movntdq	%xmm3, 0x30(%edx)
1297	movntdq	%xmm4, 0x40(%edx)
1298	movntdq	%xmm5, 0x50(%edx)
1299	movntdq	%xmm6, 0x60(%edx)
1300	movntdq	%xmm7, 0x70(%edx)
1301	lea	0x80(%edx), %edx
1302	jae	L(large_page_loop)
1303	add	$0x80, %ecx	/* ECX = bytes still to copy, below 128 */
1304	cmp	$0x40, %ecx
1305	jb	L(large_page_less_64bytes)
1306
	/* At least 64 bytes left: copy 64.  */
1307	movdqu	(%eax), %xmm0
1308	movdqu	0x10(%eax), %xmm1
1309	movdqu	0x20(%eax), %xmm2
1310	movdqu	0x30(%eax), %xmm3
1311	lea	0x40(%eax), %eax
1312
1313	movntdq	%xmm0, (%edx)
1314	movntdq	%xmm1, 0x10(%edx)
1315	movntdq	%xmm2, 0x20(%edx)
1316	movntdq	%xmm3, 0x30(%edx)
1317	lea	0x40(%edx), %edx
1318	sub	$0x40, %ecx
1319L(large_page_less_64bytes):
1320	cmp	$32, %ecx
1321	jb	L(large_page_less_32bytes)
	/* At least 32 bytes left: copy 32.  */
1322	movdqu	(%eax), %xmm0
1323	movdqu	0x10(%eax), %xmm1
1324	lea	0x20(%eax), %eax
1325	movntdq	%xmm0, (%edx)
1326	movntdq	%xmm1, 0x10(%edx)
1327	lea	0x20(%edx), %edx
1328	sub	$0x20, %ecx
1329L(large_page_less_32bytes):
	/* Fewer than 32 bytes remain.  Move both pointers to the end of the
	   data, issue sfence after the last movntdq, and let the forward
	   table copy the rest with ordinary stores.  */
1330	add	%ecx, %edx
1331	add	%ecx, %eax
1332	sfence
1333	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1334
/* Copy ECX bytes from EAX to EDX with rep movsl, then 0 to 3 trailing
   bytes by hand.  Entered with EDI and ESI still pushed (the unwind
   state restored below); both are popped before returning.  */
1335	cfi_restore_state
1336	cfi_remember_state
1337	ALIGN (4)
1338L(copy_page_by_rep):
1339	mov	%eax, %esi
1340	mov	%edx, %edi
1341	mov	%ecx, %edx
1342	shr	$2, %ecx	/* ECX = number of dwords */
1343	and	$3, %edx	/* EDX = trailing bytes; sets ZF for the jz */
1344	rep	movsl
	/* rep movsl leaves the flags alone, so ZF still reflects the and.  */
1345	jz	L(copy_page_by_rep_exit)
1346	cmp	$2, %edx
1347	jb	L(copy_page_by_rep_left_1)
	/* Two or three bytes left: copy one 16-bit word.  */
1348	movzwl	(%esi), %eax
1349	movw	%ax, (%edi)
1350	add	$2, %esi
1351	add	$2, %edi
1352	sub	$2, %edx
1353	jz	L(copy_page_by_rep_exit)
1354L(copy_page_by_rep_left_1):
1355	movzbl	(%esi), %eax
1356	movb	%al, (%edi)
1357L(copy_page_by_rep_exit):
1358	POP (%esi)
1359	POP (%edi)
	/* Return value: DEST reloaded from the stack, plus LEN for mempcpy;
	   none for bcopy.  */
1360#ifndef USE_AS_BCOPY
1361	movl	DEST(%esp), %eax
1362# ifdef USE_AS_MEMPCPY
1363	movl	LEN(%esp), %ecx
1364	add	%ecx, %eax
1365# endif
1366#endif
1367	RETURN
1368
1369	ALIGN (4)
	/* Tail handlers of the backward copy for lengths 44, 40, ..., 4, 0.
	   They are targets of L(table_48_bytes_bwd).  EAX (source) and EDX
	   (destination) point at the lowest address of the N bytes still
	   to be copied.  L(bk_write_Nbytes) copies the dword at offset
	   N-4, using ECX as scratch, and falls through into the handler
	   for N-4, so memory is written from the highest address down.  */
1370L(bk_write_44bytes):
1371	movl	40(%eax), %ecx
1372	movl	%ecx, 40(%edx)
1373L(bk_write_40bytes):
1374	movl	36(%eax), %ecx
1375	movl	%ecx, 36(%edx)
1376L(bk_write_36bytes):
1377	movl	32(%eax), %ecx
1378	movl	%ecx, 32(%edx)
1379L(bk_write_32bytes):
1380	movl	28(%eax), %ecx
1381	movl	%ecx, 28(%edx)
1382L(bk_write_28bytes):
1383	movl	24(%eax), %ecx
1384	movl	%ecx, 24(%edx)
1385L(bk_write_24bytes):
1386	movl	20(%eax), %ecx
1387	movl	%ecx, 20(%edx)
1388L(bk_write_20bytes):
1389	movl	16(%eax), %ecx
1390	movl	%ecx, 16(%edx)
1391L(bk_write_16bytes):
1392	movl	12(%eax), %ecx
1393	movl	%ecx, 12(%edx)
1394L(bk_write_12bytes):
1395	movl	8(%eax), %ecx
1396	movl	%ecx, 8(%edx)
1397L(bk_write_8bytes):
1398	movl	4(%eax), %ecx
1399	movl	%ecx, 4(%edx)
1400L(bk_write_4bytes):
1401	movl	(%eax), %ecx
1402	movl	%ecx, (%edx)
1403L(bk_write_0bytes):
	/* Nothing left to copy.  Return value: EAX = DEST, plus LEN when
	   built as mempcpy (LEN is re-read from the stack because ECX was
	   used as scratch above); nothing is loaded when built as bcopy.  */
1404#ifndef USE_AS_BCOPY
1405	movl	DEST(%esp), %eax
1406# ifdef USE_AS_MEMPCPY
1407	movl	LEN(%esp), %ecx
1408	add	%ecx, %eax
1409# endif
1410#endif
1411	RETURN
1412
1413	ALIGN (4)
	/* Tail handlers of the backward copy for lengths 45, 41, ..., 5, 1
	   (N % 4 == 1).  They are targets of L(table_48_bytes_bwd).  EAX
	   (source) and EDX (destination) point at the lowest address of
	   the N bytes still to be copied.  Each label copies the dword at
	   offset N-4, using ECX as scratch, and falls through into the
	   handler for N-4; the last handler copies the single byte at
	   offset 0.  Memory is written from the highest address down.  */
1414L(bk_write_45bytes):
1415	movl	41(%eax), %ecx
1416	movl	%ecx, 41(%edx)
1417L(bk_write_41bytes):
1418	movl	37(%eax), %ecx
1419	movl	%ecx, 37(%edx)
1420L(bk_write_37bytes):
1421	movl	33(%eax), %ecx
1422	movl	%ecx, 33(%edx)
1423L(bk_write_33bytes):
1424	movl	29(%eax), %ecx
1425	movl	%ecx, 29(%edx)
1426L(bk_write_29bytes):
1427	movl	25(%eax), %ecx
1428	movl	%ecx, 25(%edx)
1429L(bk_write_25bytes):
1430	movl	21(%eax), %ecx
1431	movl	%ecx, 21(%edx)
1432L(bk_write_21bytes):
1433	movl	17(%eax), %ecx
1434	movl	%ecx, 17(%edx)
1435L(bk_write_17bytes):
1436	movl	13(%eax), %ecx
1437	movl	%ecx, 13(%edx)
1438L(bk_write_13bytes):
1439	movl	9(%eax), %ecx
1440	movl	%ecx, 9(%edx)
1441L(bk_write_9bytes):
1442	movl	5(%eax), %ecx
1443	movl	%ecx, 5(%edx)
1444L(bk_write_5bytes):
1445	movl	1(%eax), %ecx
1446	movl	%ecx, 1(%edx)
1447L(bk_write_1bytes):
	/* Final byte at offset 0.  */
1448	movzbl	(%eax), %ecx
1449	movb	%cl, (%edx)
	/* Return value: EAX = DEST, plus LEN when built as mempcpy (LEN is
	   re-read from the stack because ECX was used as scratch above);
	   nothing is loaded when built as bcopy.  */
1450#ifndef USE_AS_BCOPY
1451	movl	DEST(%esp), %eax
1452# ifdef USE_AS_MEMPCPY
1453	movl	LEN(%esp), %ecx
1454	add	%ecx, %eax
1455# endif
1456#endif
1457	RETURN
1458
1459	ALIGN (4)
	/* Tail handlers of the backward copy for lengths 46, 42, ..., 6, 2
	   (N % 4 == 2).  They are targets of L(table_48_bytes_bwd).  EAX
	   (source) and EDX (destination) point at the lowest address of
	   the N bytes still to be copied.  Each label copies the dword at
	   offset N-4, using ECX as scratch, and falls through into the
	   handler for N-4; the last handler copies the 16-bit word at
	   offset 0.  Memory is written from the highest address down.  */
1460L(bk_write_46bytes):
1461	movl	42(%eax), %ecx
1462	movl	%ecx, 42(%edx)
1463L(bk_write_42bytes):
1464	movl	38(%eax), %ecx
1465	movl	%ecx, 38(%edx)
1466L(bk_write_38bytes):
1467	movl	34(%eax), %ecx
1468	movl	%ecx, 34(%edx)
1469L(bk_write_34bytes):
1470	movl	30(%eax), %ecx
1471	movl	%ecx, 30(%edx)
1472L(bk_write_30bytes):
1473	movl	26(%eax), %ecx
1474	movl	%ecx, 26(%edx)
1475L(bk_write_26bytes):
1476	movl	22(%eax), %ecx
1477	movl	%ecx, 22(%edx)
1478L(bk_write_22bytes):
1479	movl	18(%eax), %ecx
1480	movl	%ecx, 18(%edx)
1481L(bk_write_18bytes):
1482	movl	14(%eax), %ecx
1483	movl	%ecx, 14(%edx)
1484L(bk_write_14bytes):
1485	movl	10(%eax), %ecx
1486	movl	%ecx, 10(%edx)
1487L(bk_write_10bytes):
1488	movl	6(%eax), %ecx
1489	movl	%ecx, 6(%edx)
1490L(bk_write_6bytes):
1491	movl	2(%eax), %ecx
1492	movl	%ecx, 2(%edx)
1493L(bk_write_2bytes):
	/* Final 16-bit word at offset 0.  */
1494	movzwl	(%eax), %ecx
1495	movw	%cx, (%edx)
	/* Return value: EAX = DEST, plus LEN when built as mempcpy (LEN is
	   re-read from the stack because ECX was used as scratch above);
	   nothing is loaded when built as bcopy.  */
1496#ifndef USE_AS_BCOPY
1497	movl	DEST(%esp), %eax
1498# ifdef USE_AS_MEMPCPY
1499	movl	LEN(%esp), %ecx
1500	add	%ecx, %eax
1501# endif
1502#endif
1503	RETURN
1504
1505	ALIGN (4)
	/* Tail handlers of the backward copy for lengths 47, 43, ..., 7, 3
	   (N % 4 == 3).  They are targets of L(table_48_bytes_bwd).  EAX
	   (source) and EDX (destination) point at the lowest address of
	   the N bytes still to be copied.  Each label copies the dword at
	   offset N-4, using ECX as scratch, and falls through into the
	   handler for N-4; the last handler copies the word at offset 1
	   and then the byte at offset 0.  Memory is written from the
	   highest address down.  */
1506L(bk_write_47bytes):
1507	movl	43(%eax), %ecx
1508	movl	%ecx, 43(%edx)
1509L(bk_write_43bytes):
1510	movl	39(%eax), %ecx
1511	movl	%ecx, 39(%edx)
1512L(bk_write_39bytes):
1513	movl	35(%eax), %ecx
1514	movl	%ecx, 35(%edx)
1515L(bk_write_35bytes):
1516	movl	31(%eax), %ecx
1517	movl	%ecx, 31(%edx)
1518L(bk_write_31bytes):
1519	movl	27(%eax), %ecx
1520	movl	%ecx, 27(%edx)
1521L(bk_write_27bytes):
1522	movl	23(%eax), %ecx
1523	movl	%ecx, 23(%edx)
1524L(bk_write_23bytes):
1525	movl	19(%eax), %ecx
1526	movl	%ecx, 19(%edx)
1527L(bk_write_19bytes):
1528	movl	15(%eax), %ecx
1529	movl	%ecx, 15(%edx)
1530L(bk_write_15bytes):
1531	movl	11(%eax), %ecx
1532	movl	%ecx, 11(%edx)
1533L(bk_write_11bytes):
1534	movl	7(%eax), %ecx
1535	movl	%ecx, 7(%edx)
1536L(bk_write_7bytes):
1537	movl	3(%eax), %ecx
1538	movl	%ecx, 3(%edx)
1539L(bk_write_3bytes):
	/* Word at offset 1, then the byte at offset 0.  The byte goes
	   through EAX, which is no longer needed as the source pointer.  */
1540	movzwl	1(%eax), %ecx
1541	movw	%cx, 1(%edx)
1542	movzbl	(%eax), %eax
1543	movb	%al, (%edx)
	/* Return value: EAX = DEST, plus LEN when built as mempcpy (LEN is
	   re-read from the stack because ECX was used as scratch above);
	   nothing is loaded when built as bcopy.  */
1544#ifndef USE_AS_BCOPY
1545	movl	DEST(%esp), %eax
1546# ifdef USE_AS_MEMPCPY
1547	movl	LEN(%esp), %ecx
1548	add	%ecx, %eax
1549# endif
1550#endif
#ifdef USE_AS_MEMMOVE
	/* L(copy_backward) is assembled after this return and is entered
	   by a jump while EBX is still on the stack (PIC).  RETURN puts
	   the "EBX pushed" unwind state back after the ret; RETURN_END
	   would leave the CFA offset 4 bytes short for all of the
	   backward-copy code.  Only CFI directives differ; the emitted
	   instructions are the same.  */
	RETURN
#else
	/* Last return of the function: only the jump tables and END
	   follow, so the unwind state need not be re-established.  */
1551	RETURN_END
#endif
1552
1553
1554	.pushsection .rodata.ssse3,"a",@progbits
	/* The jump tables live in their own read-only section; the matching
	   .popsection follows the last table.

	   L(table_48bytes_fwd): 48 entries, entry N leads to
	   L(fwd_write_Nbytes) for N = 0..47.  Each entry is built with
	   JMPTBL, which under PIC is the offset of the target from the
	   start of the table.  Indexed by the remaining byte count via
	   BRANCH_TO_JMPTBL_ENTRY with scale 4, so the entries must stay
	   4 bytes wide and in ascending order.  */
1555	ALIGN (2)
1556L(table_48bytes_fwd):
1557	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1558	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1559	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1560	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1561	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1562	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1563	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1564	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1565	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1566	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1567	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1568	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1569	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1570	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1571	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1572	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1573	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1574	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1575	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1576	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1577	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1578	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1579	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1580	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1581	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1582	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1583	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1584	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1585	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1586	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1587	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1588	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1589	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1590	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1591	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1592	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1593	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1594	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1595	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1596	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1597	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1598	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1599	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1600	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1601	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1602	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1603	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1604	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1605
1606	ALIGN (2)
	/* L(shl_table): 16 entries, entry N leads to L(shl_N) for
	   N = 0..15.  Each entry is built with JMPTBL, which under PIC is
	   the offset of the target from the start of the table.  The
	   L(shl_N) code and the users of this table are outside this part
	   of the file; presumably the index is a 0-15 byte misalignment
	   -- TODO confirm.  */
1607L(shl_table):
1608	.int	JMPTBL (L(shl_0), L(shl_table))
1609	.int	JMPTBL (L(shl_1), L(shl_table))
1610	.int	JMPTBL (L(shl_2), L(shl_table))
1611	.int	JMPTBL (L(shl_3), L(shl_table))
1612	.int	JMPTBL (L(shl_4), L(shl_table))
1613	.int	JMPTBL (L(shl_5), L(shl_table))
1614	.int	JMPTBL (L(shl_6), L(shl_table))
1615	.int	JMPTBL (L(shl_7), L(shl_table))
1616	.int	JMPTBL (L(shl_8), L(shl_table))
1617	.int	JMPTBL (L(shl_9), L(shl_table))
1618	.int	JMPTBL (L(shl_10), L(shl_table))
1619	.int	JMPTBL (L(shl_11), L(shl_table))
1620	.int	JMPTBL (L(shl_12), L(shl_table))
1621	.int	JMPTBL (L(shl_13), L(shl_table))
1622	.int	JMPTBL (L(shl_14), L(shl_table))
1623	.int	JMPTBL (L(shl_15), L(shl_table))
1624
1625	ALIGN (2)
	/* L(table_48_bytes_bwd): 48 entries, entry N leads to
	   L(bk_write_Nbytes) for N = 0..47.  Each entry is built with
	   JMPTBL, which under PIC is the offset of the target from the
	   start of the table.  Indexed by the remaining byte count via
	   BRANCH_TO_JMPTBL_ENTRY with scale 4 at L(bk_write_less48bytes),
	   so the entries must stay 4 bytes wide and in ascending order.  */
1626L(table_48_bytes_bwd):
1627	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1628	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1629	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1630	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1631	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1632	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1633	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1634	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1635	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1636	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1637	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1638	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1639	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1640	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1641	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1642	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1643	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1644	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1645	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1646	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1647	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1648	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1649	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1650	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1651	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1652	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1653	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1654	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1655	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1656	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1657	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1658	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1659	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1660	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1661	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1662	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1663	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1664	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1665	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1666	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1667	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1668	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1669	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1670	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1671	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1672	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1673	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1674	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1675
	/* End of the jump tables: back to the section that was current
	   before the .pushsection above.  */
1676	.popsection
1677
1678#ifdef USE_AS_MEMMOVE
	/* Backward (high address to low address) copy, built only for
	   memmove.  As used below, on entry EAX = source, EDX =
	   destination and ECX = byte count; they are loaded by code
	   outside this part of the file -- TODO confirm.
	   Inside this routine ESI and EDX point one past the bytes still
	   to be copied, ECX is the count still to go, and EAX is a data
	   scratch register.  */
1679	ALIGN (4)
1680L(copy_backward):
	/* Save ESI and turn both pointers into end pointers.  */
1681	PUSH (%esi)
1682	movl	%eax, %esi
1683	add	%ecx, %edx
1684	add	%ecx, %esi
	/* Destination end not 4-byte aligned: peel 1-3 bytes first.  */
1685	testl	$0x3, %edx
1686	jnz	L(bk_align)
1687
1688L(bk_aligned_4):
	/* 64 bytes or more go through the 16-byte SSE loop.  */
1689	cmp	$64, %ecx
1690	jae	L(bk_write_more64bytes)
1691
1692L(bk_write_64bytesless):
1693	cmp	$32, %ecx
1694	jb	L(bk_write_less32bytes)
1695
1696L(bk_write_more32bytes):
1697	/* Copy 32 bytes at a time.  */
	/* Eight dwords, highest address first, each through EAX.  */
1698	sub	$32, %ecx
1699	movl	-4(%esi), %eax
1700	movl	%eax, -4(%edx)
1701	movl	-8(%esi), %eax
1702	movl	%eax, -8(%edx)
1703	movl	-12(%esi), %eax
1704	movl	%eax, -12(%edx)
1705	movl	-16(%esi), %eax
1706	movl	%eax, -16(%edx)
1707	movl	-20(%esi), %eax
1708	movl	%eax, -20(%edx)
1709	movl	-24(%esi), %eax
1710	movl	%eax, -24(%edx)
1711	movl	-28(%esi), %eax
1712	movl	%eax, -28(%edx)
1713	movl	-32(%esi), %eax
1714	movl	%eax, -32(%edx)
1715	sub	$32, %edx
1716	sub	$32, %esi
1717
1718L(bk_write_less32bytes):
	/* Turn the end pointers back into start pointers of the ECX bytes
	   that remain (EAX = source, EDX = destination), which is what the
	   L(bk_write_Nbytes) handlers expect, and restore ESI.  */
1719	movl	%esi, %eax
1720	sub	%ecx, %edx
1721	sub	%ecx, %eax
1722	POP (%esi)
1723L(bk_write_less48bytes):
	/* Dispatch on the remaining length; the table has 48 entries, so
	   ECX must be below 48 here.  This label is not referenced in
	   this part of the file; presumably it is also jumped to directly
	   for short moves -- TODO confirm.  */
1724	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1725
1726	CFI_PUSH (%esi)
	/* The directive above only updates the unwind info: the code
	   below is reached by jumps from L(copy_backward), where ESI is
	   still on the stack, not by falling through the POP above.

	   Make the destination end pointer EDX 4-byte aligned by copying
	   1, 2 or 3 bytes backwards.  ESI and EDX are end pointers, ECX is
	   the count still to go, EAX is a data scratch register.  */
1727	ALIGN (4)
1728L(bk_align):
	/* With 8 bytes or fewer, skip the alignment and let the tail
	   handlers copy everything.  */
1729	cmp	$8, %ecx
1730	jbe	L(bk_write_less32bytes)
1731	testl	$1, %edx
1732	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
1733	   then (EDX & 2) must be != 0.  */
1734	jz	L(bk_got2)
	/* Odd end address: copy one byte.  */
1735	sub	$1, %esi
1736	sub	$1, %ecx
1737	sub	$1, %edx
1738	movzbl	(%esi), %eax
1739	movb	%al, (%edx)
1740
	/* Already 4-byte aligned after that single byte?  */
1741	testl	$2, %edx
1742	jz	L(bk_aligned_4)
1743
1744L(bk_got2):
	/* Copy one 16-bit word; EDX is 4-byte aligned afterwards.  */
1745	sub	$2, %esi
1746	sub	$2, %ecx
1747	sub	$2, %edx
1748	movzwl	(%esi), %eax
1749	movw	%ax, (%edx)
1750	jmp	L(bk_aligned_4)
1751
1752	ALIGN (4)
	/* Backward copy of 64 bytes or more.  Entered from
	   L(bk_aligned_4) with ESI/EDX as end pointers of source and
	   destination and ECX >= 64.  The destination end is first brought
	   to a 16-byte boundary, then 64-byte blocks are moved with
	   unaligned loads and aligned stores.  EAX is a data scratch
	   register.  */
1753L(bk_write_more64bytes):
1754	/* Check alignment of last byte.  */
1755	testl	$15, %edx
1756	jz	L(bk_ssse3_cpy_pre)
1757
1758/* EDX is aligned 4 bytes, but not 16 bytes.  */
1759L(bk_ssse3_align):
	/* Copy one dword backwards at a time, at most three times, until
	   EDX is 16-byte aligned.  */
1760	sub	$4, %esi
1761	sub	$4, %ecx
1762	sub	$4, %edx
1763	movl	(%esi), %eax
1764	movl	%eax, (%edx)
1765
1766	testl	$15, %edx
1767	jz	L(bk_ssse3_cpy_pre)
1768
1769	sub	$4, %esi
1770	sub	$4, %ecx
1771	sub	$4, %edx
1772	movl	(%esi), %eax
1773	movl	%eax, (%edx)
1774
1775	testl	$15, %edx
1776	jz	L(bk_ssse3_cpy_pre)
1777
	/* Third dword: a 4-byte aligned EDX is necessarily 16-byte
	   aligned after this one, so no further test is needed.  */
1778	sub	$4, %esi
1779	sub	$4, %ecx
1780	sub	$4, %edx
1781	movl	(%esi), %eax
1782	movl	%eax, (%edx)
1783
1784L(bk_ssse3_cpy_pre):
	/* The alignment step may have dropped ECX below 64 (but not below
	   52), in which case the 32-byte dword copy takes over.  */
1785	cmp	$64, %ecx
1786	jb	L(bk_write_more32bytes)
1787
1788L(bk_ssse3_cpy):
	/* 64 bytes per iteration, highest 16 bytes first: unaligned loads
	   (movdqu) from the source, aligned stores (movdqa) to the
	   16-byte aligned destination.  */
1789	sub	$64, %esi
1790	sub	$64, %ecx
1791	sub	$64, %edx
1792	movdqu	0x30(%esi), %xmm3
1793	movdqa	%xmm3, 0x30(%edx)
1794	movdqu	0x20(%esi), %xmm2
1795	movdqa	%xmm2, 0x20(%edx)
1796	movdqu	0x10(%esi), %xmm1
1797	movdqa	%xmm1, 0x10(%edx)
1798	movdqu	(%esi), %xmm0
1799	movdqa	%xmm0, (%edx)
1800	cmp	$64, %ecx
1801	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes left: finish with the dword/tail code.  */
1802	jmp	L(bk_write_64bytesless)
1803
1804#endif
1805
1806END (MEMCPY)
1807
1808#endif
1809