1/* Optimized wcslen for x86-64 with SSE2.
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21	.text
/* size_t __wcslen (const wchar_t *s)

   Return the number of 4-byte elements that precede the first zero
   element of the array at S.

   In:	 %rdi = s
   Out:	 %rax = element count
   Uses: %rax, %rcx, %rdx, %xmm0-%xmm3, %xmm6, flags

   Strategy:
     1. Test the first eight elements one at a time with scalar compares.
     2. Test up to twelve 16-byte aligned vectors, one vector at a time.
     3. Round the cursor down to a 64-byte boundary and scan 64 bytes
	per iteration.

   In the vector code %rax is the scan cursor and %rcx is a bias chosen
   so that, on arrival at L(exit), %rax - %rcx is the byte offset from S
   of the vector holding the terminator and %edx is that vector's
   pmovmskb mask (four mask bits per element).  */
ENTRY (__wcslen)
	/* Step 1: elements 0..7.  Each element is read only after all
	   earlier ones were found to be nonzero.  */
	cmpl	$0, (%rdi)
	jz	L(exit_tail0)
	cmpl	$0, 4(%rdi)
	jz	L(exit_tail1)
	cmpl	$0, 8(%rdi)
	jz	L(exit_tail2)
	cmpl	$0, 12(%rdi)
	jz	L(exit_tail3)
	cmpl	$0, 16(%rdi)
	jz	L(exit_tail4)
	cmpl	$0, 20(%rdi)
	jz	L(exit_tail5)
	cmpl	$0, 24(%rdi)
	jz	L(exit_tail6)
	cmpl	$0, 28(%rdi)
	jz	L(exit_tail7)

	pxor	%xmm0, %xmm0

	/* Step 2.  %rax = s + 32 rounded down to a 16-byte boundary, so
	   the first vector may overlap the 32 bytes tested above; those
	   elements are already known to be nonzero.
	   %rcx = s + 16: every check below advances %rax by 16 before it
	   branches, so %rax - %rcx at L(exit) is the offset of the vector
	   that was just tested.  */
	lea	32(%rdi), %rax
	lea	16(%rdi), %rcx
	and	$-16, %rax

	/* Vector checks 1-4.  Each compares one vector against an all-zero
	   register; the pxor in checks 1-3 zeroes the register used by
	   the following check.  The lea sits between test and jnz because
	   it does not modify the flags.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Vector checks 5-12 reuse %xmm0-%xmm3 without clearing them: a
	   compare whose mask was zero left its register all-zero.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)

	/* Step 3.  Round the cursor down to a 64-byte boundary.  This can
	   move it back over vectors that were already tested; they hold
	   no zero element, so scanning them again is harmless.  %xmm3 is
	   all-zero here because the last check above found no match.  */
	and	$-0x40, %rax

	.p2align 4
L(aligned_64_loop):
	movaps	(%rax), %xmm0
	movaps	16(%rax), %xmm1
	movaps	32(%rax), %xmm2
	movaps	48(%rax), %xmm6

	/* Fold the four vectors into %xmm2 with a byte-wise unsigned
	   minimum.  A zero element in any vector gives a zero dword in
	   the result.  The converse does not hold: bytes taken from
	   different vectors can combine into a zero dword, so a hit is
	   confirmed vector by vector below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%rax), %rax
	jz	L(aligned_64_loop)

	/* %rax already points past the 64-byte block.  Vector k of the
	   block (k = 0..3) is at %rax - 64 + 16 * k, so %rcx is moved to
	   s + 64 - 16 * k before each branch to keep the L(exit)
	   invariant.  Vectors 0 and 2 are reloaded from memory because
	   pminub overwrote %xmm0 and %xmm2.  */
	pcmpeqd	-64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	-32(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	/* False positive: no vector holds a zero element.  %rcx is back
	   at s + 16 (+48 - 16 - 16 - 16) and %xmm3 is all-zero again, so
	   the loop can simply continue with the next block.  */
	jmp	L(aligned_64_loop)

	/* %rax = index of the first element of the matching vector.  The
	   low nibble of each mask byte belongs to an even element (0 or
	   2), the high nibble to the odd element after it (1 or 3).  */
	.p2align 4
L(exit):
	sub	%rcx, %rax
	shr	$2, %rax
	test	%dl, %dl
	jz	L(exit_high)

	/* Terminator is element 0 or 1.  */
	test	$15, %dl
	jz	L(exit_1)
	ret

	/* Terminator is element 2 or 3.  */
	.p2align 4
L(exit_high):
	test	$15, %dh
	jz	L(exit_3)
	add	$2, %rax
	ret

	.p2align 4
L(exit_1):
	add	$1, %rax
	ret

	.p2align 4
L(exit_3):
	add	$3, %rax
	ret

	/* Results for a terminator among the first eight elements.  The
	   32-bit destination forms also clear the upper half of %rax.  */
	.p2align 4
L(exit_tail0):
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_tail1):
	mov	$1, %eax
	ret

	.p2align 4
L(exit_tail2):
	mov	$2, %eax
	ret

	.p2align 4
L(exit_tail3):
	mov	$3, %eax
	ret

	.p2align 4
L(exit_tail4):
	mov	$4, %eax
	ret

	.p2align 4
L(exit_tail5):
	mov	$5, %eax
	ret

	.p2align 4
L(exit_tail6):
	mov	$6, %eax
	ret

	.p2align 4
L(exit_tail7):
	mov	$7, %eax
	ret

END (__wcslen)

weak_alias(__wcslen, wcslen)
238