/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_evex
#  endif

# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# define XMM2		xmm18
# define XMM3		xmm19

# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22
# define YMM7		ymm23
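
/* The working vectors YMM2-YMM7 and the zero register below are mapped
   to ymm16-ymm23.  These registers are only encodable with EVEX, so the
   function never touches ymm0-ymm15 and can return without VZEROUPPER.  */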

# ifndef USE_AS_STRCAT

/* zero register */
#  define XMMZERO	xmm16
#  define YMMZERO	ymm16
#  define YMM1		ymm17

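/* On entry %rdi is the destination and %rsi the source; for strncpy
   %rdx (RDX_LP) holds the maximum number of bytes, which is moved to
   %r8.  Unless this is stpcpy, the original destination is saved in
   %rax as the return value.  %rcx is used as the source offset/index
   throughout.  */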
	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
#  endif

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# endif

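	/* %rcx holds the source address.  Reduce it to the offset within
	   an aligned 4 * VEC_SIZE block; if that offset is at most
	   2 * VEC_SIZE, the two unaligned vector loads at
	   L(SourceStringAlignmentLessTwoVecSize) stay inside that block
	   and so cannot cross a page boundary.  */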
	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

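	/* %rsi is rounded down to a VEC_SIZE boundary while %ecx keeps
	   the byte offset of the real string start.  Shift the null-byte
	   mask right by %cl to discard the bits for bytes before the
	   string.  */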
	vpcmpb	$0, (%rsi), %YMMZERO, %k0
	kmovd	%k0, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd	%k1, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
	VMOVU	%YMM2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
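	/* Make the count relative to the rounded-down source: add the
	   offset in %rcx back to %r8.  If the addition wraps, the SBB
	   leaves -1 in %rcx and the OR saturates %r8 to the maximum
	   value.  */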
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	VMOVA	(%rsi, %rcx), %YMM2
	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb	$0, %YMM4, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM4, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU	%YMM2, (%rdi, %rcx)
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
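
/* Main loop: copy 4 * VEC_SIZE bytes per iteration using aligned loads
   and unaligned stores.  The byte-wise unsigned minimum of the four
   vectors is zero iff one of them contains a null byte, so a single
   VPCMPB against the zero vector checks all four at once.  */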
L(UnalignedFourVecSizeLoop):
	VMOVA	(%rsi), %YMM4
	VMOVA	VEC_SIZE(%rsi), %YMM5
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM5, %YMM4, %YMM2
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA	(%rsi), %YMM4
	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA	VEC_SIZE(%rsi), %YMM5
	vpminub	%YMM5, %YMM4, %YMM2
	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU	%YMM7, -VEC_SIZE(%rdi)
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %ecx
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* Source offset within an aligned 4 * VEC_SIZE block is at most
   2 * VEC_SIZE, so two unaligned vector loads are safe.  */

L(SourceStringAlignmentLessTwoVecSize):
	VMOVU	(%rsi), %YMM3
	VMOVU	VEC_SIZE(%rsi), %YMM2
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	VMOVU	%YMM3, (%rdi)
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

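/* The Case1/Case2/Case3 labels below distinguish how a copy ends:
   Case1: a null byte was found (and, for strncpy, it lies within the
	  count limit).
   Case2: strncpy only; both a null byte and the count limit fall in the
	  current vector, and whichever comes first decides the exit.
   Case3: strncpy only; the count limit is reached before any null byte
	  is seen.  */
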
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
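/* %edx is the byte index of the null terminator relative to the current
   %rsi/%rdi.  Dispatch to a size-bucketed exit that copies the remaining
   %edx + 1 bytes (terminator included), using overlapping loads and
   stores for the larger cases.  */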
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU	%YMM6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU	%YMM5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU	%YMM4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU	%YMM3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------Exit labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes------------*/

	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

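/* L(Exit4_7) through L(Exit32_63) copy %rdx + 1 bytes with two loads and
   two stores that may overlap: one at the start of the tail and one
   ending exactly at the null terminator.  */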
	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU	(%rsi), %XMM2
	VMOVU	-15(%rsi, %rdx), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU	(%rsi), %YMM2
	VMOVU	-31(%rsi, %rdx), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

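/* The L(StrncpyExit*) labels are used when the count limit is reached at
   or before the null terminator: exactly %r8 bytes are copied and no
   terminating null is written, except for strcat, which always
   terminates the result.  */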
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU	(%rsi), %XMM2
	VMOVU	-16(%rsi, %r8), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/*  0/32, 31/16 */
	VMOVU	(%rsi), %YMM2
	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU	(%rsi), %YMM2
	VMOVU	32(%rsi), %YMM3
	mov	64(%rsi), %cl
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	ret

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU	%XMMZERO, (%rdi)
	VMOVU	%XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU	%YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

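/* strncpy pads the remainder of the destination with zeros.  Store one
   unaligned zero vector, align %rdi, then clear 4 * VEC_SIZE bytes per
   iteration; the last 0..VEC_SIZE bytes are written by the L(Fill*)
   exits using %rdx/%XMMZERO, which are zero here.  */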
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	VMOVU	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
#  endif

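/* Reached from the 4 * VEC_SIZE loop when the strncpy count is exhausted
   within the current four vectors.  A nonzero %rdx means a null byte was
   also found (Case2); otherwise only the count limits the copy (Case3).  */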
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	VMOVU	%YMM4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %edx
	VMOVU	%YMM4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %edx
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
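	/* The count limit is reached first: copy the remaining %r8 bytes.  */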
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif