/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
# define CHAR_PER_VEC	4
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
# define CHAR_PER_VEC	16
#endif
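/* One body builds both functions: wmemchr compares 4-byte wchar_t
   elements with pcmpeqd (CHAR_PER_VEC = 4 characters per 16-byte
   vector), while memchr compares bytes with pcmpeqb (16 per vector).
   The length in RDX counts characters, not bytes.  */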

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
#else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif
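	/* Broadcast the search character to every lane of xmm1: for
	   memchr the two punpcklbw steps widen the byte to a dword and
	   the pshufd below replicates it across the vector; wmemchr's
	   4-byte character needs only the pshufd.  */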

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)
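	/* The low 6 bits of the address are at most 48, so this
	   unaligned 16-byte load cannot cross a 64-byte boundary;
	   otherwise L(crosscache) uses an aligned load and shifts off
	   the leading bytes.  */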

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
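	/* %eax is the byte offset of the match; for wmemchr convert it
	   to a character count before checking it against the remaining
	   length.  */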
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" as
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

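	/* Check the next 64 bytes one vector at a time.  If %rdi ends
	   up 64-byte aligned, drop into the main loop; otherwise scan
	   one more 64-byte block and then align down.  */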
	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

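	/* Align %rdi down to 64 bytes and add the characters stepped
	   back over to %rdx so the length accounting stays exact.  */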
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx

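	/* Main loop: compare four vectors per iteration.  Each PCMPEQ
	   result is all-ones or all-zeros per lane, so the byte-wise
	   maximum computed by pmaxub ORs the four match masks together
	   and a single pmovmskb tests the whole 64-byte block.  This
	   also holds for wmemchr, whose pcmpeqd results are still
	   0x00/0xff in every byte.  */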
	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

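	/* The pmaxub steps clobbered %xmm3 and %xmm4, so the third and
	   fourth vectors must be compared again to locate the match.  */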
	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
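	/* Fewer than CHAR_PER_VEC * 4 characters remain; %rdx is still
	   biased by -(CHAR_PER_VEC * 4).  Adding CHAR_PER_VEC * 2 tells
	   whether more than two vectors of characters are left.  */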
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
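	/* The "_1" labels are reached before the length check for this
	   vector: verify that the match offset lies within the
	   remaining length before returning it.  */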
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif