/* strcat with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_avx2
# endif

/* Defined ahead of the strcpy-avx2.S include at the bottom of this file;
   presumably selects the strcat flavour of that shared copy code --
   confirm against strcpy-avx2.S.  */
# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

/* SECTION (.text) expands to .text.avx unless the includer overrides it.  */
# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* STRCAT: locate the terminating NUL of the string at %rdi with AVX2
   byte compares, then fall into the copy code pulled in from
   strcpy-avx2.S to append the string at %rsi.

   Registers on entry (as used below):
     %rdi  destination string; only read during the scan and rewritten
	   at L(StartStrcpyPart).
     %rsi  source string.
     %rdx  saved in %r8 when USE_AS_STRNCAT is defined (presumably the
	   strncat length limit -- confirm against the strncat wrapper).

   Registers during the scan:
     %r9   copy of the original %rdi.
     %ymm6 all-zero comparand.
     %rax  0 on the first-vector fast path, otherwise the VEC_SIZE
	   aligned address being examined; every exit stub turns it
	   into the byte offset of the NUL from %rdi.
     %edx  vpmovmskb bit mask, one bit per byte that equals zero.
     %ecx, %r10d  scratch for the head mask in
	   L(fourth_vector_boundary).
     %ymm0-%ymm5  scratch.  */
	.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8
# endif

	/* %eax = 0 so that L(exit_null_on_first_vector) can add the bit
	   index straight onto it.  */
	xor	%eax, %eax
	/* %ecx = offset of %rdi inside its 4 * VEC_SIZE aligned block.  */
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	/* The VEX-encoded 128-bit form also clears the upper half of
	   %ymm6.  */
	vpxor	%xmm6, %xmm6, %xmm6
	/* With an offset of at most 3 * VEC_SIZE an unaligned VEC_SIZE
	   load from %rdi stays inside that aligned block; otherwise take
	   the aligned-load path.  */
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	/* No NUL in the first VEC_SIZE bytes: round the cursor down to a
	   VEC_SIZE boundary.  The next vector examined starts at
	   VEC_SIZE(%rax), which is no further than %rdi + VEC_SIZE, so no
	   byte is skipped.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* Load the aligned vector that contains %rdi and discard the mask
	   bits of the bytes that precede %rdi.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb	(%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	/* Only %cl is consumed and a 32-bit shift uses its low 5 bits;
	   %rax is a multiple of VEC_SIZE, so the effective count is
	   %rdi % VEC_SIZE.  */
	sub	%rax, %rcx
	shl	%cl, %r10d
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

	/* %rax is VEC_SIZE aligned and the vector at (%rax) has been dealt
	   with.  Check the following 16 vectors one at a time, as four
	   unrolled groups of four.  %rax moves forward by 4 * VEC_SIZE
	   between groups, so the first vector of each later group is read
	   at (VEC_SIZE * 5)(%rax) before the add and reported through the
	   "second vector" stub after it.  */
L(align_vec_size_start):
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Everything up to and including the vector at
	   (VEC_SIZE * 4)(%rax) is NUL free.  If %rax is already
	   4 * VEC_SIZE aligned, enter the main loop at %rax; the loop then
	   re-reads vectors already known to be clean, which is harmless.  */
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	/* Otherwise advance one vector at a time until %rax is
	   4 * VEC_SIZE aligned.  After each step %rax addresses the vector
	   just checked, which is what L(exit) expects.  */
	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	/* NOTE(review): %rax is always a multiple of VEC_SIZE here, so one
	   of the three alignment tests above should already have branched;
	   this last step looks unreachable -- confirm before removing.  */
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

	/* Main loop: %rax is 4 * VEC_SIZE aligned, so the aligned loads
	   are valid.  vpminub folds four vectors together: a byte of
	   %ymm5 is zero iff the same byte position is zero in at least one
	   of the four vectors.  %rax is advanced before the test, so on
	   exit it points just past the block that contains the NUL.  */
	.p2align 4
L(align_four_vec_loop):
	vmovaps	(%rax),	%ymm4
	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
	add	$(VEC_SIZE * 4),	%rax
	vpminub	%ymm4,	%ymm5, %ymm5
	vpcmpeqb %ymm5,	%ymm6, %ymm5
	vpmovmskb %ymm5,	%edx
	test	%edx,	%edx
	jz	L(align_four_vec_loop)

	/* Find which of the four vectors holds the NUL.  %rax is rebased
	   to one vector before the block so that the shared stubs, which
	   add VEC_SIZE .. 3 * VEC_SIZE, can be reused.  */
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5),	%rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	/* The NUL has to be in the last vector of the block, so the mask
	   is used without a test.  */
	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

	/* Exit stubs.  Each one leaves
	     %rax = (%rax - %rdi) + index of the lowest set bit of %edx
		    + K * VEC_SIZE
	   which is the offset of the NUL from the start of the
	   destination.  L(exit) has K = 0; L(exit_null_on_first_vector)
	   is entered with %rax already 0 and skips the subtraction.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	/* The last stub falls through into L(StartStrcpyPart).  */
	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

	/* Hand-off to the copy code: %rdi = address of the destination's
	   NUL, %rcx = source pointer, %rax = original destination.  */
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax      /* save result */

# ifdef USE_AS_STRNCAT
	/* Skip the copy when the saved %rdx is zero.  L(ExitZero) is not
	   defined in this file; presumably strcpy-avx2.S provides it.  */
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

/* The routine continues in the included file; no END appears here, so
   presumably strcpy-avx2.S closes the function -- confirm.  */
# include "strcpy-avx2.S"
#endif