/* strcat with SSE2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_sse2_unaligned
# endif

# define USE_AS_STRCAT

/* STRCAT (dst, src) -- x86-64, AT&T syntax, SSE2.

   Phase 1 (this file): find the length of the string at DST with an
   inlined strlen.
   Phase 2 (included file): continue into strcpy-sse2-unaligned.S with
   the destination pointer advanced to DST's terminating NUL.

   Register roles in phase 1:
     rdi   dst (never modified until L(StartStrcpyPart))
     rsi   src (untouched here)
     r9    copy of dst, becomes the return value
     r8    copy of rdx, only when USE_AS_STRNCAT is defined
     rax   scan pointer while searching; strlen (dst) at every
	   jump to L(StartStrcpyPart)
     edx   pmovmskb mask of NUL bytes in the chunk just examined
     rcx, r10d, xmm0-xmm5   scratch

   State handed to the included code at L(StartStrcpyPart):
     rdi = dst + strlen (dst), rsi = rcx = src, rax = dst,
     and r8 as above when USE_AS_STRNCAT is defined.
   NOTE(review): the exact register contract of the included strcpy
   body, and the definition of L(ExitZero), are not visible in this
   file -- confirm against strcpy-sse2-unaligned.S.

   Invariant used throughout: pcmpeqb leaves 0x00 in every byte that
   did not match, so a compare register whose mask tested as zero is
   still all-zero.  xmm0-xmm3 are therefore reused as "zero" for later
   compares without being cleared again.  */

.text
ENTRY (STRCAT)
	mov	%rdi, %r9		/* r9 = dst, returned later.  */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8		/* r8 = third argument.  */
# endif

/* Inline corresponding strlen file, temporary until new strcpy
   implementation gets merged.  */

	xor	%eax, %eax		/* rax = 0 for L(exit_less16).  */
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = dst offset in its 64-byte line.  */
	pxor	%xmm0, %xmm0
	cmp	$0x30, %ecx
	ja	L(next)

	/* Offset <= 0x30: an unaligned 16-byte load from dst ends at or
	   before the end of the 64-byte line, so it is done directly.  */
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit_less16)		/* rax is still 0: length = bit index.  */
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = dst rounded down to 16.  */
	jmp	L(align16_start)

L(next):
	/* Offset > 0x30: load the aligned 16-byte chunk that contains dst
	   and discard matches in the bytes that precede dst.  */
	mov	%rdi, %rax
	and	$-16, %rax
	pcmpeqb	(%rax), %xmm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	/* A 32-bit shift uses only the low 5 bits of cl, and
	   ((dst & 0x3f) - (dst & -16)) mod 32 == dst & 15.  */
	shl	%cl, %r10d
	pmovmskb %xmm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

L(align16_start):
	/* rax is 16-byte aligned and every byte of the string below
	   rax + 16 has been checked.  Scan 16 aligned chunks (4 groups
	   of 4), one 16-byte compare at a time.  */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

	/* Group 1.  */
	pcmpeqb	16(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Group 2.  The compare uses the old rax (+80); after the add the
	   same chunk is at rax + 16, which is what L(exit16) expects.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Group 3.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Group 4.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Everything below rax + 80 has been checked.  Bring rax to a
	   64-byte boundary for the main loop, re-checking at most one
	   chunk.  rax is 16-aligned, so rax mod 64 is 0, 16, 32 or 48.  */
	test	$0x3f, %rax
	jz	L(align64_loop)

	/* rax mod 64 is 16, 32 or 48.  The first step adds 80 (== 16
	   mod 64) and every later step adds 16, so a 64-byte boundary is
	   reached after at most three steps.  After each step rax points
	   at the chunk that was just checked.  */
	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	/* Third step done: rax is necessarily 64-byte aligned here (see
	   the arithmetic above), so no further test or chunk check is
	   needed; fall through into the loop.  */

	.p2align 4
L(align64_loop):
	/* rax is 64-byte aligned.  The unsigned byte-wise minimum of the
	   four 16-byte chunks has a zero byte iff some chunk has one.
	   xmm0 is all-zero here (see the invariant in the header).  */
	movaps	(%rax), %xmm4
	pminub	16(%rax), %xmm4
	movaps	32(%rax), %xmm5
	pminub	48(%rax), %xmm5
	add	$64, %rax
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %edx
	test	%edx, %edx
	jz	L(align64_loop)

	/* A NUL lies in the 64-byte block at rax - 64.  Re-bias rax to
	   (block - 16) so the four chunks sit at +16/+32/+48/+64 and the
	   shared exit stubs apply.  */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* The first three chunks had no NUL, so the fourth one must;
	   its mask is used without testing it.  */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	sub	%rdi, %rax
	bsf	%edx, %edx
	add	%rdx, %rax
	add	$64, %rax
	jmp	L(StartStrcpyPart)

/* Exit stubs.  On entry edx is a non-zero NUL mask whose lowest set
   bit is the index of the first NUL inside the matching chunk.  Each
   stub turns (chunk address, bit index) into rax = strlen (dst).
   bsf on edx also clears the upper half of rdx, so the 64-bit add
   that follows is exact.  */

	.p2align 4
L(exit):
	/* rax = address of the matching chunk.  */
	sub	%rdi, %rax
L(exit_less16):
	/* rax = 0 when entered by this label: chunk starts at dst.  */
	bsf	%edx, %edx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit16):
	/* Matching chunk is at rax + 16.  */
	sub	%rdi, %rax
	bsf	%edx, %edx
	add	%rdx, %rax
	add	$16, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit32):
	/* Matching chunk is at rax + 32.  */
	sub	%rdi, %rax
	bsf	%edx, %edx
	add	%rdx, %rax
	add	$32, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit48):
	/* Matching chunk is at rax + 48.  */
	sub	%rdi, %rax
	bsf	%edx, %edx
	add	%rdx, %rax
	add	$48, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit64):
	/* Matching chunk is at rax + 64.  Falls through.  */
	sub	%rdi, %rax
	bsf	%edx, %edx
	add	%rdx, %rax
	add	$64, %rax

	.p2align 4
L(StartStrcpyPart):
	/* rax = strlen (dst), r9 = dst.  */
	lea	(%r9, %rax), %rdi	/* rdi = address of dst's NUL.  */
	mov	%rsi, %rcx		/* rcx = src.  */
	mov	%r9, %rax      /* save result */

# ifdef USE_AS_STRNCAT
	/* A zero count goes straight to L(ExitZero), which is not
	   defined in this file.  */
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

	/* Execution continues in the included copy code; the function
	   is not closed in this file.  */
# include "strcpy-sse2-unaligned.S"
#endif
279