1/* Fast SSE2 rawmemchr using a 64-byte loop and the pmaxub instruction.
2
3   Copyright (C) 2011-2021 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21
22	.text
/* __rawmemchr -- find the first occurrence of a byte, with no length limit.

   In:   %rdi = start address of the scan
         %rsi = byte to search for (only the low 8 bits end up being used)
   Out:  %rax = address of the first matching byte at or after %rdi
   Uses: %rcx, %xmm0-%xmm4, flags; %rdi and %xmm1 are modified.

   NOTE(review): presumably this is the GNU rawmemchr (const void *s, int c)
   entry point under the SysV AMD64 calling convention -- confirm against
   the prototype.  Nothing here bounds the scan: every exit is a match, so
   the caller has to guarantee that the byte is present.

   Strategy: one 16-byte check at the start (unaligned when that is safe,
   aligned and masked otherwise), then 64 or 128 bytes checked one aligned
   16-byte block at a time, then a 64-byte-aligned main loop that tests 64
   bytes per iteration with a single branch.  */
23ENTRY (__rawmemchr)
/* Keep a copy of the start pointer in %rcx for the alignment tests, and
   replicate the search byte into all 16 byte lanes of %xmm1: the two
   punpcklbw steps widen byte 0 to 2 and then 4 copies, and pshufd $0
   copies that low dword into the other three dwords.  */
24	movd	%rsi, %xmm1
25	mov	%rdi, %rcx
26
27	punpcklbw %xmm1, %xmm1
28	punpcklbw %xmm1, %xmm1
29
/* %rcx = offset of the start pointer inside its 64-byte block.  */
30	and	$63, %rcx
31	pshufd	$0, %xmm1, %xmm1
32
/* With an offset above 48 a 16-byte load from %rdi would run past the end
   of that 64-byte block, so take the aligned-load path instead.  */
33	cmp	$48, %rcx
34	ja	L(crosscache)
35
/* The first 16 bytes lie inside one 64-byte block: check them with an
   unaligned load.  */
36	movdqu	(%rdi), %xmm0
37	pcmpeqb	%xmm1, %xmm0
38/* Check if there is a match.  */
39	pmovmskb %xmm0, %eax
40	test	%eax, %eax
41
42	jnz	L(matches)
/* No match: round %rdi up to the next 16-byte boundary.  If %rdi was not
   already aligned, some of the bytes just checked are examined again by
   the aligned code, which is harmless.  */
43	add	$16, %rdi
44	and	$-16, %rdi
45	jmp	L(loop_prolog)
46
47	.p2align 4
48L(crosscache):
/* %rcx = offset of the start pointer inside its 16-byte block (nonzero
   here, because the 64-byte offset was above 48).  Load the whole aligned
   block that contains the start.  */
49	and	$15, %rcx
50	and	$-16, %rdi
51	movdqa	(%rdi), %xmm0
52
53	pcmpeqb	%xmm1, %xmm0
54/* Check if there is a match.  */
55	pmovmskb %xmm0, %eax
56/* Remove the leading bytes.  */
/* The mask bits for bytes before the original start pointer are shifted
   out.  Only the low 16 bits of the mask can be set, so the arithmetic
   shift never brings in sign bits.  */
57	sar	%cl, %eax
58	test	%eax, %eax
59	je	L(unaligned_no_match)
60/* Check which byte is a match.  */
/* After the shift the bit index is relative to the original start, so the
   result is aligned base + start offset + index.  */
61	bsf	%eax, %eax
62
63	add	%rdi, %rax
64	add	%rcx, %rax
65	ret
66
67	.p2align 4
68L(unaligned_no_match):
/* Step to the next aligned 16-byte block and fall through, across the
   alignment padding, into L(loop_prolog).  */
69	add	$16, %rdi
70
71	.p2align 4
72L(loop_prolog):
/* %rdi is 16-byte aligned from here on.  Check the next four 16-byte
   blocks one at a time; the L(matches*) target that is taken encodes the
   offset of the block that hit.  */
73	movdqa	(%rdi), %xmm0
74	pcmpeqb	%xmm1, %xmm0
75	pmovmskb %xmm0, %eax
76	test	%eax, %eax
77	jnz	L(matches)
78
79	movdqa	16(%rdi), %xmm2
80	pcmpeqb	%xmm1, %xmm2
81	pmovmskb %xmm2, %eax
82	test	%eax, %eax
83	jnz	L(matches16)
84
85	movdqa	32(%rdi), %xmm3
86	pcmpeqb	%xmm1, %xmm3
87	pmovmskb %xmm3, %eax
88	test	%eax, %eax
89	jnz	L(matches32)
90
/* %rdi is advanced before the branch, so a hit in this fourth block goes
   to L(matches0), which compensates with a -16 displacement (48 - 64).  */
91	movdqa	48(%rdi), %xmm4
92	pcmpeqb	%xmm1, %xmm4
93	add	$64, %rdi
94	pmovmskb %xmm4, %eax
95	test	%eax, %eax
96	jnz	L(matches0)
97
/* If %rdi is now 64-byte aligned, enter the main loop directly.  */
98	test	$0x3f, %rdi
99	jz	L(align64_loop)
100
/* Otherwise check another four 16-byte blocks the same way.  */
101	movdqa	(%rdi), %xmm0
102	pcmpeqb	%xmm1, %xmm0
103	pmovmskb %xmm0, %eax
104	test	%eax, %eax
105	jnz	L(matches)
106
107	movdqa	16(%rdi), %xmm2
108	pcmpeqb	%xmm1, %xmm2
109	pmovmskb %xmm2, %eax
110	test	%eax, %eax
111	jnz	L(matches16)
112
113	movdqa	32(%rdi), %xmm3
114	pcmpeqb	%xmm1, %xmm3
115	pmovmskb %xmm3, %eax
116	test	%eax, %eax
117	jnz	L(matches32)
118
/* Fourth block of this group.  The compare result lands in %xmm3 here
   rather than %xmm4 as in the first group; it is consumed immediately, so
   the register choice makes no difference.  */
119	movdqa	48(%rdi), %xmm3
120	pcmpeqb	%xmm1, %xmm3
121	pmovmskb %xmm3, %eax
122
123	add	$64, %rdi
124	test	%eax, %eax
125	jnz	L(matches0)
126
/* Round %rdi down to a 64-byte boundary for the main loop.  %rdi is
   16-byte aligned but not 64-byte aligned here, so this steps back by 16,
   32 or 48 bytes, all inside the 64 bytes just checked: nothing is
   skipped, a few blocks are merely scanned twice.  */
127	and	$-64, %rdi
128
129	.p2align 4
130L(align64_loop):
/* Main loop: compare four aligned 16-byte blocks against the pattern.
   Each byte lane of a compare result is 0xff on a match and 0 otherwise.  */
131	movdqa	(%rdi), %xmm0
132	movdqa	16(%rdi), %xmm2
133	movdqa	32(%rdi), %xmm3
134	movdqa	48(%rdi), %xmm4
135
136	pcmpeqb	%xmm1, %xmm0
137	pcmpeqb	%xmm1, %xmm2
138	pcmpeqb	%xmm1, %xmm3
139	pcmpeqb	%xmm1, %xmm4
140
/* Merge the four results with an unsigned byte maximum so that %xmm4 has
   a nonzero lane wherever any block matched; a single pmovmskb then covers
   all 64 bytes.  The merge overwrites %xmm3 and %xmm4 but leaves %xmm0 and
   %xmm2 intact.  */
141	pmaxub	%xmm0, %xmm3
142	pmaxub	%xmm2, %xmm4
143	pmaxub	%xmm3, %xmm4
144	pmovmskb %xmm4, %eax
145
146	add	$64, %rdi
147
148	test	%eax, %eax
149	jz	L(align64_loop)
150
/* A match is somewhere in the 64 bytes just examined; point %rdi back at
   their start.  */
151	sub	$64, %rdi
152
/* Blocks 0 and 1: their individual compare results are still in %xmm0 and
   %xmm2.  */
153	pmovmskb %xmm0, %eax
154	test	%eax, %eax
155	jnz	L(matches)
156
157	pmovmskb %xmm2, %eax
158	test	%eax, %eax
159	jnz	L(matches16)
160
/* Block 2: %xmm3 was overwritten by the merge, so reload it and compare
   again.  */
161	movdqa	32(%rdi), %xmm3
162	pcmpeqb	%xmm1, %xmm3
163
/* Block 3 is compared straight from memory into %xmm1.  That destroys the
   replicated pattern, which is no longer needed because every path from
   here returns.  */
164	pcmpeqb	48(%rdi), %xmm1
165	pmovmskb %xmm3, %eax
166	test	%eax, %eax
167	jnz	L(matches32)
168
/* Blocks 0-2 had no match, so the match is in block 3 and this mask is
   nonzero; no test is needed before bsf.  */
169	pmovmskb %xmm1, %eax
170	bsf	%eax, %eax
171	lea	48(%rdi, %rax), %rax
172	ret
173
/* Exit stubs.  In each one bsf yields the index of the lowest set mask
   bit, i.e. of the first matching byte within the 16-byte block, and the
   displacement selects the block relative to %rdi.  */
174	.p2align 4
/* Fourth block of a prolog group; %rdi has already been advanced by 64.  */
175L(matches0):
176	bsf	%eax, %eax
177	lea	-16(%rax, %rdi), %rax
178	ret
179
180	.p2align 4
/* Block at %rdi.  */
181L(matches):
182	bsf	%eax, %eax
183	add	%rdi, %rax
184	ret
185
186	.p2align 4
/* Block at %rdi + 16.  */
187L(matches16):
188	bsf	%eax, %eax
189	lea	16(%rax, %rdi), %rax
190	ret
191
192	.p2align 4
/* Block at %rdi + 32.  */
193L(matches32):
194	bsf	%eax, %eax
195	lea	32(%rax, %rdi), %rax
196	ret
197
198END (__rawmemchr)
199
/* NOTE(review): weak_alias and libc_hidden_builtin_def are macros defined
   outside this file.  Presumably the first publishes rawmemchr as a weak
   alias of __rawmemchr and the second sets up the hidden definition used
   for calls from inside the library -- confirm against their
   definitions.  */
200weak_alias (__rawmemchr, rawmemchr)
201libc_hidden_builtin_def (__rawmemchr)
202