/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define VMOVA		vmovdqa64

# define YMMMATCH	ymm16

# define VEC_SIZE 32

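/* void *__memrchr_evex (const void *s, int c, size_t n)
   Input (System V x86-64 ABI): RDI = s, ESI = c, RDX = n.
   Return a pointer to the last byte equal to (unsigned char) c within
   the first n bytes of s, or NULL if no such byte exists.  */
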
	.section .text.evex,"ax",@progbits
ENTRY (__memrchr_evex)
	/* Broadcast CHAR to YMMMATCH.  */
	vpbroadcastb %esi, %YMMMATCH

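	/* If N <= VEC_SIZE, handle it with a single masked VEC check.  */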
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

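	/* Point RDI at the start of the last VEC_SIZE bytes
	   (S + N - VEC_SIZE).  */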
	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

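	/* Step back 4 * VEC_SIZE and check whether RDI is already
	   VEC_SIZE-aligned; if not, round it up to the next boundary and
	   adjust the remaining length accordingly.  */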
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
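	/* Here RDI is VEC_SIZE-aligned and RDX is the number of bytes
	   between the start of the buffer and RDI + 4 * VEC_SIZE.  If at
	   most 4 * VEC_SIZE bytes remain, take the tail path.  */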
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

	.p2align 4
L(loop_4x_vec):
	/* Check 4 * VEC_SIZE bytes at a time, moving backward through
	   the buffer.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

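	/* Compare all four VECs.  KORD folds the match masks so a single
	   KORTESTD can tell whether any of them contains CHAR.  */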
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
	kord	%k1, %k2, %k5
	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

	kord	%k3, %k4, %k6
	kortestd %k5, %k6
	jz	L(loop_4x_vec)

	/* There is a match.  Check the masks from the highest-addressed
	   VEC (K4) down so the last match is found first.  */
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	kmovd	%k1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_4x_vec_or_less):
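	/* At most 4 * VEC_SIZE bytes are left to check; they are the
	   highest bytes of the 4 * VEC_SIZE window at RDI.  Restore the
	   remaining length in EDX and scan the window one VEC at a time
	   from the highest address, rejecting matches that fall below
	   the start of the buffer.  */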
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_2x_vec):
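	/* At most 2 * VEC_SIZE bytes remain, so only the two highest
	   VECs of the window can hold valid bytes.  */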
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

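	/* A match was found in VEC N of the window: BSR gives the highest
	   set mask bit, i.e. the last matching byte within that VEC.  Add
	   the VEC offset and the base pointer to form the return value.  */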
	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

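	/* Like the cases above, but the low bytes of the VEC may lie
	   below the start of the buffer; return NULL when the match is
	   out of bounds.  */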
	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

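	/* N <= VEC_SIZE and S is VEC_SIZE-aligned: compare a single VEC
	   and mask off the bits beyond the length.  */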
	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	/* The length can be VEC_SIZE (32), and a 32-bit shift by 32 is a
	   no-op, so build the mask with a 64-bit shift.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

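	/* N <= VEC_SIZE: EDX currently holds N - VEC_SIZE.  Restore N and
	   dispatch on the alignment of S.  */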
	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(zero)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

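	/* S is not aligned.  Save the misalignment in R8D, align RDI down
	   and check whether [S, S + N) fits in one aligned VEC or spans
	   two.  */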
	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret

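	/* [S, S + N) spans two aligned VECs.  ESI is the number of bytes
	   that fall in the upper VEC; R8D is the number of leading bytes
	   to skip in the lower VEC.  */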
	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	%r8d, %ecx

	kmovd	%k1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret
END (__memrchr_evex)
#endif