/* wcsrchr with SSE2
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21	.text
/* wchar_t *wcsrchr (const wchar_t *s, wchar_t c)

   Return a pointer to the last wide character equal to C in the
   NUL-terminated wide string S, or NULL if there is none.  The
   terminating L'\0' counts as part of the string, so searching for
   L'\0' returns a pointer to the terminator.

   In:	%rdi = s, %esi = c.
   Out:	%rax = pointer to the last match, or 0.
   Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm5, flags.

   Register roles in the main loop:
     %xmm1  C replicated into all four dwords.
     %xmm2  all zero on entry to every 16-byte step; after pcmpeqd it
	    holds the NUL mask of the block just loaded (and is thus
	    still all zero whenever no NUL was found).
     %rdi   address of the next block to load.  Right after a block
	    has been examined it is that block's address plus 16.
     %r8    match mask of the latest block that contained a match and
	    no NUL, or 0 if no such block has been seen yet.
     %rsi   value %rdi had when %r8 was recorded (block address + 16).

   Mask layout: pcmpeqd sets all four bytes of a matching dword and
   pmovmskb collects one bit per byte, so wide character N of a block
   (N = 0..3) owns bits 4*N .. 4*N+3 of the 16-bit mask.  All masks
   fit in 16 bits, hence 32-bit operations are used on them.  */
ENTRY (wcsrchr)

	/* Only the low 32 bits of %rsi carry C.  Two punpckldq steps
	   replicate that dword into all four lanes of %xmm1.  */
	movd	%esi, %xmm1
	mov	%edi, %ecx
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1
	/* If S lies more than 48 bytes into its 64-byte line, an
	   unaligned 16-byte load would run past the end of that line;
	   take the aligned-load path instead.  */
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(crosscache)

	/* First 16 bytes, loaded unaligned straight from S.
	   %ecx = NUL mask, %eax = match mask.  */
	movdqu	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	add	$16, %rdi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	/* No match; a NUL here means C does not occur at all.  */
	test	%ecx, %ecx
	jnz	L(return_null)

	/* Round down to the next aligned block.  It may overlap bytes
	   already examined, which is harmless.  */
	and	$-16, %rdi
	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(unaligned_match1):
	/* Match in the first (unaligned) block.  */
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	/* No NUL yet: remember this block as the latest match.  %rsi
	   keeps the unaligned end address so that the recorded mask
	   bits map back to the right characters.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	and	$-16, %rdi
	jmp	L(loop)

	.p2align 4
L(crosscache):
	/* Load the aligned block containing S and shift both masks
	   right by the misalignment (%cl) so that bit 0 corresponds to
	   S itself and the bytes in front of S are discarded.  %xmm3
	   is used for the NUL compare so %xmm2 stays all zero.  */
	and	$15, %ecx
	and	$-16, %rdi
	pxor	%xmm3, %xmm3
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm3, %edx
	pmovmskb %xmm0, %eax
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %rdi

	test	%eax, %eax
	jnz	L(unaligned_match)

	/* No match; a NUL here means C does not occur at all.  */
	test	%edx, %edx
	jnz	L(return_null)

	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(unaligned_match):
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	/* Record the match.  The masks were shifted by %rcx, so the
	   matching end address is shifted by the same amount.  */
	mov	%eax, %r8d
	lea	(%rdi, %rcx), %rsi

/* Main loop over aligned 16-byte blocks, unrolled four times.  Each
   step leaves the match mask in %eax and (NUL mask | match mask) in
   %ecx, and leaves the loop as soon as either is non-zero.  */
	.p2align 4
L(loop):
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm4, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%rdi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm5, %eax
	or	%eax, %ecx
	jz	L(loop)

	.p2align 4
L(matches):
	/* The current block holds a match, a NUL, or both.  */
	test	%eax, %eax
	jnz	L(match)
	/* NUL without a match in this block: fall through and return
	   the latest recorded match, if any.  */
L(return_value):
	test	%r8d, %r8d
	jz	L(return_null)
	mov	%r8d, %eax
	mov	%rsi, %rdi

	/* Select the highest matching wide character of the block
	   that ends at %rdi.  */
	test	$0xf0, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(match):
	/* %ecx was merged with the match mask in the loop; fetch the
	   pure NUL mask again.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
	/* Match but no NUL: record it and keep scanning.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	jmp	L(loop)

	.p2align 4
L(find_zero):
	/* The block holds both a match and a NUL.  Locate the first
	   NUL, then keep only match bits at or before it.  The lowest
	   bit of the NUL position is kept on purpose: it can only be
	   set in the match mask when C is L'\0', in which case the
	   terminator itself is the result.  */
	test	$15, %cl
	jnz	L(find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(find_zero_in_third_wchar)

	/* NUL is the fourth wide character: keep bits 0..12.  */
	and	$0x1fff, %eax
	jz	L(return_value)

	test	$0xf0, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_first_wchar):
	/* NUL is the first wide character: only C == L'\0' can match
	   within the string in this block.  */
	test	$1, %al
	jz	L(return_value)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_second_wchar):
	/* NUL is the second wide character: keep bits 0..4.  */
	and	$0x1f, %eax
	jz	L(return_value)

	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_third_wchar):
	/* NUL is the third wide character: keep bits 0..8.  */
	and	$0x1ff, %eax
	jz	L(return_value)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

/* Same NUL/match resolution as L(find_zero), but for the very first
   block: no earlier match has been recorded, so "nothing left after
   masking" means NULL rather than L(return_value).  */
	.p2align 4
L(prolog_find_zero):
	/* Coming from L(crosscache): make %rdi the end address that
	   corresponds to the shifted masks, and move the NUL mask to
	   %ecx.  */
	add	%rcx, %rdi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	test	$15, %cl
	jnz	L(prolog_find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(prolog_find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(prolog_find_zero_in_third_wchar)

	/* NUL is the fourth wide character: keep bits 0..12.  */
	and	$0x1fff, %eax
	jz	L(return_null)

	test	$0xf0, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_first_wchar):
	test	$1, %al
	jz	L(return_null)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	/* NUL is the second wide character: keep bits 0..4.  */
	and	$0x1f, %eax
	jz	L(return_null)

	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_third_wchar):
	/* NUL is the third wide character: keep bits 0..8.  */
	and	$0x1ff, %eax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$0xf0, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

/* Result stubs.  %rdi is the end address (start + 16) of the block
   that holds the result.  */
	.p2align 4
L(match_second_wchar):
	lea	-12(%rdi), %rax
	ret

	.p2align 4
L(match_third_wchar):
	lea	-8(%rdi), %rax
	ret

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%rdi), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

END (wcsrchr)
282