/* memrchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_avx2
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
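
/* memrchr (s, c, n) returns a pointer to the last byte equal to c in
   the n bytes starting at s, or NULL if there is none.  On entry
   %rdi = s, %esi = c and %rdx = n.  The code scans backwards from the
   end of the buffer, VEC_SIZE bytes per vpcmpeqb, and falls into a
   4 * VEC_SIZE unrolled loop once the data is aligned; inputs of at
   most VEC_SIZE bytes are handled separately at L(last_vec_or_less).
   Informally, the behaviour matches this byte-at-a-time C sketch
   (illustrative only, not the control flow used below):

     void *
     memrchr_sketch (const void *s, int c, size_t n)
     {
       const unsigned char *start = (const unsigned char *) s;
       const unsigned char *p = start + n;

       while (p != start)
         if (*--p == (unsigned char) c)
           return (void *) p;
       return NULL;
     }  */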

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP
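	/* %rdi == s + n - VEC_SIZE and %rdx == n - VEC_SIZE: everything
	   at or above %rdi is covered by the unaligned compare below,
	   and [s, %rdi) remains to be searched.  */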

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx
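	/* Both paths reach L(aligned_more) with %rdi aligned to
	   VEC_SIZE and %rdx == (%rdi + 4 * VEC_SIZE) - s, the number
	   of buffer bytes below %rdi + 4 * VEC_SIZE (re-scanning a few
	   already-checked bytes is harmless).  */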

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
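	/* Check from the highest address down so that the match
	   closest to the end of the buffer is found first.  */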
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx
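	/* On entry to the loop %rdi is aligned to 4 * VEC_SIZE and
	   %rdx == %rdi - s, the number of buffer bytes below %rdi that
	   are still to be searched (the first iteration may re-check a
	   few bytes already covered above).  */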

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time, moving backwards.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match.  */
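	/* %ymm4 covers the highest addresses and %ymm1 the lowest, so
	   test in that order; bsrl then picks the highest matching
	   byte within the chosen vector.  */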
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(last_4x_vec_or_less):
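	/* At most 4 * VEC_SIZE buffer bytes lie below %rdi + 4 * VEC_SIZE.
	   The addl below restores %edx to that count; the four vectors
	   are then scanned from the highest address down, switching to
	   bounds-checked returns once a match could fall before the
	   start of the buffer.  */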
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
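	/* If %edx <= 3 * VEC_SIZE the buffer starts at or above
	   %rdi + VEC_SIZE, so the vector at %rdi holds no buffer bytes
	   at all.  */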
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
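	/* The buffer starts at %rdi + 4 * VEC_SIZE - %rdx.  A match at
	   %rdi + %rax is inside the buffer only if
	   %rax + %rdx - 4 * VEC_SIZE is not negative, which is what the
	   subq/addq/jl sequence below checks.  */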
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
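	/* At most 2 * VEC_SIZE buffer bytes lie below %rdi + 4 * VEC_SIZE,
	   all within the two vectors at %rdi + 2 * VEC_SIZE and
	   %rdi + 3 * VEC_SIZE.  */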
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
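	/* L(last_vec_xN): a match was found in the vector loaded from
	   %rdi + N * VEC_SIZE; bsrl selects the highest set mask bit,
	   i.e. the match closest to the end of the buffer.  The _check
	   variants further below also reject matches that fall before
	   the start of the buffer.  */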
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(null):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	$1, %edx
	/* Support rdx << 32.  */
	salq	%cl, %rdx
	subq	$1, %rdx
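	/* The length in %cl can be exactly VEC_SIZE (32); the shift
	   above is done on the full 64-bit register because a 32-bit
	   shift count would be taken modulo 32.  */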

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_or_less):
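	/* Come here when the total length n is at most VEC_SIZE.  %rdi
	   still points at the start of the buffer; the addl below
	   restores %edx to n.  */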
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)
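	/* The whole buffer fits within the single aligned vector at
	   %rdi.  Shift out the %cl leading bytes that precede the
	   buffer and mask off the bytes past its end, so that bit i of
	   the result corresponds to byte i of the buffer.  The
	   arithmetic shift is fine here: any sign bits shifted in from
	   the top are cleared by the length mask.  */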

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_2x_aligned):
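	/* The buffer spans two aligned vectors.  %esi holds the number
	   of buffer bytes in the higher vector at %rdi + VEC_SIZE;
	   check that vector first since it is closer to the end of the
	   buffer, then the vector at %rdi with its %r8d leading bytes
	   removed.  */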
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN
END (MEMRCHR)
#endif