/* strcpy with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* zero register */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* mask register */
#define ymmM	ymm1

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
#  endif

# endif

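/* Register usage: %rdi is the destination, %rsi the source, %rcx the
   source offset used for alignment handling and %rax the return
   value; the bounded variants keep the byte limit in %r8.  %ymmZ is
   kept zero for the whole function so that VPCMPEQB against it
   yields a byte mask of NUL positions.  */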
	vpxor	%xmmZ, %xmmZ, %xmmZ

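/* Branch on the source offset within an aligned 4 * VEC_SIZE block:
   if it is at most 2 * VEC_SIZE, the first two vectors can be read
   with unaligned loads without crossing that block.  */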
	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

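/* Align the source down to VEC_SIZE and keep the misalignment in
   %ecx.  The NUL mask from the aligned load is shifted right by %cl
   so that bits for bytes before the original source pointer are
   discarded.  */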
	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
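/* Rebase the destination by the source misalignment so that the same
   index %rcx addresses both the aligned source and the destination.
   For strncpy the misalignment is added back to the limit, with %r8
   saturated to all ones if the addition overflows.  */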
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
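/* Main loop: copy 4 * VEC_SIZE bytes per iteration.  VPMINUB folds
   the four source vectors into one, so a single compare against zero
   (%ymmM holds all-zero bytes on this path) detects a NUL byte in
   any of them; the vectors are examined individually only after the
   loop exits.  */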
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

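/* A NUL byte was found in one of the four vectors just loaded; check
   them in order and finish the copy through the matching exit path.  */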
L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* The source offset within an aligned 4 * VEC_SIZE block is at most
   2 * VEC_SIZE, so the first two vectors can be read with unaligned
   loads without crossing that block.  */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

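/* The exit paths below distinguish three situations.  Case1: a NUL
   byte was found within the length limit (or there is no limit).
   Case2: a NUL byte was seen, but it must still be checked against
   the limit.  Case3: the limit runs out before any NUL byte.  Case2
   and Case3 exist only for the bounded variants.  */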
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
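/* %rdx is the offset of the NUL byte; each L(Exit*) path below
   copies %rdx + 1 bytes, i.e. the string including its terminator.  */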
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes----------*/

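/* L(ExitN) and L(ExitM_N) handle strings whose NUL byte was found at
   offset N (or within [M, N]).  They copy the terminator as well and,
   for strncpy, branch to L(StrncpyFillTailWithZero) when bytes remain
   to be zeroed.  */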
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

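/* L(StrncpyExitN) and L(StrncpyExitM_N) are reached when the length
   limit is hit first: exactly %r8 bytes (N, or within [M, N]) are
   copied and no NUL terminator is written; only the strcat variant
   stores one extra NUL byte after the copied data.  */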
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* Copy 33..64 bytes: one vector from the start and one vector
	   ending at offset %r8.  */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

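/* Zero the %r8 bytes that remain after the copied string: store one
   unaligned vector of zeros, align %rdi down to VEC_SIZE, loop with
   four aligned vector stores per iteration and finish the tail via
   the L(Fill*) paths.  */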
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

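/* The length limit was reached inside the four-vector loop.  %rdx
   still holds the combined NUL mask of the four vectors just loaded:
   non-zero means a NUL byte may still fall within the limit (Case2),
   zero means the limit expires first (Case3).  */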
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
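/* Copy exactly the remaining %r8 bytes, dispatching on the size.  No
   NUL terminator is appended except in the strcat variant.  */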
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif