/* strcat with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* zero register */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

	.section .text.evex,"ax",@progbits
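/* Equivalent C sketch (illustrative only):

     char *strcat (char *dst, const char *src)
     {
       strcpy (dst + strlen (dst), src);
       return dst;
     }

   The code below implements the strlen part, leaving the offset of
   the terminating null byte of DST in %rax; the copy is performed by
   the included strcpy-evex.S built with USE_AS_STRCAT defined.  */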
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
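	/* %ecx <= VEC_SIZE * 3: the unaligned VEC_SIZE load below
	   stays within the current (VEC_SIZE * 4) block, so it cannot
	   cross a page boundary.  */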
	vpcmpb	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
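	/* %rdi is in the last VEC_SIZE bytes of a (VEC_SIZE * 4)
	   block: load from the preceding VEC_SIZE-aligned address and
	   mask off the match bits for bytes before the string.  */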
L(fourth_vector_boundary):
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpb	$0, (%rax), %YMMZERO, %k0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d
	kmovd	%k0, %edx
	and	%r10d, %edx
	jnz	L(exit)

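	/* Unrolled scan: check one vector at a time, advancing %rax
	   by VEC_SIZE * 4 per round, until %rax can be brought to a
	   (VEC_SIZE * 4) boundary for the main loop.  */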
L(align_vec_size_start):
	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	kmovd	%k4, %edx
	add	$(VEC_SIZE * 4), %rax
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

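	/* Advance one VEC_SIZE at a time until %rax is aligned to
	   VEC_SIZE * 4, then enter the main loop.  */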
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	add	$(VEC_SIZE * 5), %rax
	kmovd	%k4, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	add	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
	add	$VEC_SIZE, %rax
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

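	/* Main loop: %rax is (VEC_SIZE * 4)-aligned.  Fold four
	   vectors with vpminub so a single compare against zero
	   detects a null byte in any of them.  */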
	.p2align 4
L(align_four_vec_loop):
	VMOVA	(%rax), %YMM0
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
	vpminub	%YMM0, %YMM1, %YMM0
	/* If K0 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM0, %YMMZERO, %k0
	add	$(VEC_SIZE * 4), %rax
	ktestd	%k0, %k0
	jz	L(align_four_vec_loop)

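	/* A null byte is somewhere in the last four vectors; back up
	   and recheck them one at a time to locate it.  */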
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
	sub	$(VEC_SIZE * 5), %rax
	kmovd	%k0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

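	/* Exit paths: each computes the null byte's offset from DST in
	   %rax as vector base minus %rdi, plus the bit index of the
	   first null in the compare mask (bsf), plus the checked
	   vector's offset.  */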
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

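	/* %rax = strlen (dst).  Point %rdi at the terminating null,
	   pass SRC in %rcx, and keep the original DST in %rax as the
	   return value before falling into strcpy-evex.S.  */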
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax      /* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-evex.S"
#endif