/* wcscpy with SSSE3
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* __wcscpy_ssse3

   Copy a string of 32-bit elements, up to and including its first zero
   element, from the address in %rsi to the address in %rdi.  The
   original %rdi is returned in %rax.  (Presumably called as
   wcscpy (dest, src) under the SysV AMD64 convention; the callers are
   not part of this file.)

   Register roles:
     %rdi   destination start; never modified, moved to %rax on return.
     %rcx   source cursor.
     %rdx   destination cursor.
     %rsi   byte offset that L(CopyFrom1To16Bytes) adds to both cursors.
     %rax   pmovmskb mask of the 16-byte block under test (four mask
	    bits per element); also scratch for pointer arithmetic and
	    for the final copies.
     %r9    scratch for 4- and 8-byte copies.
     %xmm0  all-zero comparand.  pcmpeqd writes its result into %xmm0,
	    but execution only continues past a compare when that result
	    is all zero, so %xmm0 is zero again at the next compare.

   Outline: the first four elements are tested one by one.  If none is
   zero, the destination cursor is rounded up to a 16-byte boundary, the
   source cursor is advanced by the same amount, and the code dispatches
   on the source offset within its 16-byte line.  Only the offsets 0, 4
   and 8 are recognised; every other value is sent to L(Shl12), whose
   aligned loads are only valid for an offset of exactly 12, so the
   offset is assumed to be a multiple of 4 at that point.  From there on
   all 16-byte loads and stores are aligned (movaps); a source that is
   not 16-byte aligned relative to the destination is realigned in
   registers with palignr.  */

	.section .text.ssse3,"ax",@progbits
ENTRY (__wcscpy_ssse3)

	mov	%rsi, %rcx
	mov	%rdi, %rdx

	/* Test the first four elements one at a time.  A zero in element
	   N (counting from 0) ends with a copy of exactly N + 1
	   elements.  */
	cmpl	$0, (%rcx)
	jz	L(Exit4)
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)
	cmpl	$0, 8(%rcx)
	jz	L(Exit12)
	cmpl	$0, 12(%rcx)
	jz	L(Exit16)

	/* rsi = first 16-byte boundary strictly above the source
	   start.  */
	lea	16(%rcx), %rsi
	and	$-16, %rsi

	/* The first 16 source bytes hold no zero element: copy them with
	   two 8-byte moves, interleaved with the zero test of the aligned
	   block at rsi.  */
	pxor	%xmm0, %xmm0
	mov	(%rcx), %r9
	mov	%r9, (%rdx)

	pcmpeqd	(%rsi), %xmm0
	mov	8(%rcx), %r9
	mov	%r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	/* rsi = byte distance from the source start to the tested
	   block.  */
	sub	%rcx, %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	/* Round the destination cursor up to the next 16-byte boundary
	   (an advance of 1..16 bytes) and advance the source cursor by
	   the same amount.  rax is first the negated advance, then the
	   source offset within its 16-byte line.  */
	mov	%rdx, %rax
	lea	16(%rdx), %rdx
	and	$-16, %rdx
	sub	%rdx, %rax
	sub	%rax, %rcx
	mov	%rcx, %rax
	and	$0xf, %rax
	/* This must stay a mov: mov leaves the flags alone, and the jz
	   below consumes ZF from the and above.  */
	mov	$0, %rsi

/* Case rcx_offset == rdx_offset: source and destination are both 16-byte
   aligned.  */

	jz	L(Align16Both)

	cmp	$4, %rax
	je	L(Shl4)
	cmp	$8, %rax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination share the same 16-byte alignment.  Six unrolled
   steps follow.  Each step loads the next source block, stores the block
   loaded by the previous step, and leaves through L(CopyFrom1To16Bytes)
   when the new block contains a zero element.  rsi is the running byte
   offset; it is advanced before each test so that on exit it addresses
   the block just tested.

   The very first block is stored without a test here: rcx is now the
   first 16-byte boundary above the source start, which is the block
   already tested by the entry code.  */
L(Align16Both):
	movaps	(%rcx), %xmm1
	movaps	16(%rcx), %xmm2
	movaps	%xmm1, (%rdx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm4
	movaps	%xmm3, (%rdx, %rsi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm1
	movaps	%xmm4, (%rdx, %rsi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm2
	movaps	%xmm1, (%rdx, %rsi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	/* Store the last tested block.  Then set the source cursor to
	   the 64-byte boundary at or below the first byte not yet copied
	   (rcx + rsi + 16) and shift the destination cursor by the same
	   distance.  Bytes between that boundary and the first uncopied
	   byte are simply copied a second time by the loop.  */
	movaps	%xmm3, (%rdx, %rsi)
	mov	%rcx, %rax
	lea	16(%rcx, %rsi), %rcx
	and	$-0x40, %rcx
	sub	%rcx, %rax
	sub	%rax, %rdx

	/* The loop advances both cursors by 64 before it branches to
	   L(Aligned64Leave), so the exit offset starts at -64.  */
	mov	$-0x40, %rsi

/* Main aligned loop, 64 bytes per iteration.  The unsigned byte minimum
   of the four blocks has a zero dword whenever one of the blocks holds a
   zero element, so a single pcmpeqd covers all four.  The minimum can
   also yield a zero dword from zero bytes that belong to different
   blocks; L(Aligned64Leave) therefore re-tests every block and resumes
   the loop when none of them really holds a zero element.  xmm4..xmm7
   keep the four unmodified blocks (xmm2 and xmm3 are consumed by the
   minimum).  */
	.p2align 4
L(Aligned64Loop):
	movaps	(%rcx), %xmm2
	movaps	%xmm2, %xmm4
	movaps	16(%rcx), %xmm5
	movaps	32(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	48(%rcx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	pcmpeqd	%xmm0, %xmm3
	pmovmskb %xmm3, %rax
	lea	64(%rdx), %rdx
	lea	64(%rcx), %rcx
	test	%rax, %rax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%rdx)
	movaps	%xmm5, -48(%rdx)
	movaps	%xmm6, -32(%rdx)
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* One of the four blocks may hold a zero element.  Both cursors already
   point 64 bytes past the group, so rsi steps through -64, -48, -32, -16
   to select the block under test.  A block is stored only after it has
   been found free of zero elements.  The lea between test and jnz is
   safe because lea does not modify the flags.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %rax
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0

	pmovmskb %xmm0, %rax
	movaps	%xmm4, -64(%rdx)
	test	%rax, %rax
	lea	16(%rsi), %rsi
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0

	pmovmskb %xmm0, %rax
	movaps	%xmm5, -48(%rdx)
	test	%rax, %rax
	lea	16(%rsi), %rsi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%rdx)
	pcmpeqd	%xmm7, %xmm0

	pmovmskb %xmm0, %rax
	lea	16(%rsi), %rsi
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	/* No block held a zero element after all: store the last one,
	   reset the exit offset and resume the loop.  */
	mov	$-0x40, %rsi
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

/* The source cursor is 4 bytes past a 16-byte boundary; the destination
   cursor is 16-byte aligned.  Aligned source blocks are loaded from
   rcx - 4, rcx + 12, rcx + 28, ... and every 16 destination bytes are
   assembled with palignr $4 from two consecutive blocks (the upper 12
   bytes of the older one followed by the lower 4 bytes of the newer
   one).  xmm1 and xmm3 take turns holding the older block.  */
	.p2align 4
L(Shl4):
	movaps	-4(%rcx), %xmm1
	movaps	12(%rcx), %xmm2
/* Also entered from L(Shl4LoopStart) with xmm1 = block at -4(%rcx) and
   xmm2 = block at 12(%rcx).  Four blocks are tested and copied one at a
   time.  At every exit to L(Shl4LoopExit) the block containing the zero
   element is the one at 12(%rcx).  */
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	28(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Prepare the 64-byte loop.  rcx now addresses the next aligned
	   block that has not been loaded.  Round it down to a 64-byte
	   boundary, move the destination cursor back by the same
	   distance (0..48 bytes, which get copied again), and place rcx
	   12 bytes below the boundary so that 12(%rcx) is 64-byte
	   aligned.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-12(%rcx), %rcx
	sub	%rax, %rdx

	/* xmm1 = aligned block preceding the first 64-byte group.  */
	movaps	-4(%rcx), %xmm1

/* 64 bytes per iteration.  xmm1 holds the aligned block that precedes
   12(%rcx).  The four new blocks are tested together through their
   unsigned byte minimum (see L(Aligned64Loop) for why this can report a
   zero element that is not there).  On a hit, L(Shl4Start) re-tests and
   copies the blocks one by one; it only needs xmm1 and xmm2, which are
   still unmodified at the branch.  The palignr instructions placed
   around the test do not modify the flags.  */
	.p2align 4
L(Shl4LoopStart):
	movaps	12(%rcx), %xmm2
	movaps	28(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$4, %xmm3, %xmm4
	jnz	L(Shl4Start)

	/* No zero element: finish the realignment, keep the newest block
	   in xmm1 for the next iteration, and store 64 bytes.  */
	palignr	$4, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl4LoopStart)

/* The block at 12(%rcx) contains a zero element.  Copy the 16 bytes of
   the preceding zero-free block (unaligned on the destination side),
   which covers the 12 source bytes at rcx that are not stored yet, then
   finish from offset 12.  */
L(Shl4LoopExit):
	movdqu	-4(%rcx), %xmm1
	mov	$12, %rsi
	movdqu	%xmm1, -4(%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* The source cursor is 8 bytes past a 16-byte boundary; the destination
   cursor is 16-byte aligned.  Same scheme as L(Shl4) with aligned blocks
   at rcx - 8, rcx + 8, rcx + 24, ... and palignr $8.  */
	.p2align 4
L(Shl8):
	movaps	-8(%rcx), %xmm1
	movaps	8(%rcx), %xmm2
/* Also entered from L(Shl8LoopStart) with xmm1 = block at -8(%rcx) and
   xmm2 = block at 8(%rcx).  At every exit to L(Shl8LoopExit) the block
   containing the zero element is the one at 8(%rcx).  */
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	24(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Prepare the 64-byte loop: round the address of the next
	   unloaded block down to a 64-byte boundary, move the
	   destination cursor back by the same distance, and place rcx
	   8 bytes below the boundary so that 8(%rcx) is 64-byte
	   aligned.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-8(%rcx), %rcx
	sub	%rax, %rdx

	/* xmm1 = aligned block preceding the first 64-byte group.  */
	movaps	-8(%rcx), %xmm1

/* 64 bytes per iteration; see L(Shl4LoopStart).  On a possible zero
   element L(Shl8Start) takes over with xmm1 and xmm2 still intact.  */
	.p2align 4
L(Shl8LoopStart):
	movaps	8(%rcx), %xmm2
	movaps	24(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$8, %xmm3, %xmm4
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl8LoopStart)

/* The block at 8(%rcx) contains a zero element.  Copy the 8 source bytes
   in front of it, which are not stored yet, then finish from offset
   8.  */
L(Shl8LoopExit):
	mov	(%rcx), %r9
	mov	$8, %rsi
	mov	%r9, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* The source cursor is treated as 12 bytes past a 16-byte boundary (this
   is the fall-back of the dispatch); the destination cursor is 16-byte
   aligned.  Same scheme as L(Shl4) with aligned blocks at rcx - 12,
   rcx + 4, rcx + 20, ... and palignr $12.  */
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1
	movaps	4(%rcx), %xmm2
/* Also entered from L(Shl12LoopStart) with xmm1 = block at -12(%rcx) and
   xmm2 = block at 4(%rcx).  At every exit to L(Shl12LoopExit) the block
   containing the zero element is the one at 4(%rcx).  */
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx

	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	20(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Prepare the 64-byte loop: round the address of the next
	   unloaded block down to a 64-byte boundary, move the
	   destination cursor back by the same distance, and place rcx
	   4 bytes below the boundary so that 4(%rcx) is 64-byte
	   aligned.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-4(%rcx), %rcx
	sub	%rax, %rdx

	/* xmm1 = aligned block preceding the first 64-byte group.  */
	movaps	-12(%rcx), %xmm1

/* 64 bytes per iteration; see L(Shl4LoopStart).  On a possible zero
   element L(Shl12Start) takes over with xmm1 and xmm2 still intact.  */
	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl12LoopStart)

/* The block at 4(%rcx) contains a zero element.  Copy the 4 source bytes
   in front of it, which are not stored yet, then finish from offset
   4.  */
L(Shl12LoopExit):
	mov	(%rcx), %r9d
	mov	$4, %rsi
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

/* Finish the copy and return.
   In:  rax = pmovmskb mask of a 16-byte block that contains a zero
	      element; bits 0-3, 4-7, 8-11 and 12-15 belong to elements
	      0, 1, 2 and 3 of the block.
	rsi = byte offset of that block from both cursors.
   The block is copied up to and including its first zero element.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%rsi, %rdx
	add	%rsi, %rcx

	/* al == 0: elements 0 and 1 are non-zero.  */
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)

	/* Element 0 is non-zero and element 1 is zero: copy 8 bytes.  */
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

/* The first zero element is element 2 (mask bit 8 set) or element 3.  */
	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)

	/* Element 3 is the zero element: copy all 16 bytes.  */
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

/* L(Exit4) .. L(Exit16): copy 4, 8, 12 or 16 bytes from (%rcx) to (%rdx)
   and return the destination start.  */
	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

END(__wcscpy_ssse3)
#endif