/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm9, %xmm9, %xmm9
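	/* ymm0 now holds CHAR in every element and ymm9 stays all-zero
	   for the whole function; comparing data against ymm9 detects
	   the null terminator.  */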

	/* Check if we cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
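	/* eax holds the page offset of rdi.  If it is above
	   PAGE_SIZE - VEC_SIZE, the unaligned VEC_SIZE-byte load below
	   would extend into the next, possibly unmapped, page.  */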

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
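	/* Each bit of eax corresponds to one byte of the vector and is
	   set if that byte is CHAR or the null terminator; tzcnt then
	   yields the index of the first such byte.  */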
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	/* .p2align 5 helps keep performance more consistent whether the
	   ENTRY() alignment % 32 is 16 or 0.  It also fixes the
	   alignment % 32 of loop_4x_vec, which makes tuning it easier.  */
	.p2align 5
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This takes the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code in the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
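	/* rdi now points at the last byte of its VEC_SIZE block, so the
	   1(%rdi), (VEC_SIZE + 1)(%rdi), ... loads below are aligned.
	   The + 1 in the offsets of the first_vec_x* labels above undoes
	   this bias when computing the return address.  */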
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	1(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 (round rdi down to a VEC_SIZE * 4
	   boundary).  */
	addq	$(VEC_SIZE * 4 + 1), %rdi
	andq	$-(VEC_SIZE * 4), %rdi
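	/* Rounding down may step back over a few bytes that were already
	   checked above; rescanning them in the first loop iteration is
	   harmless.  */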
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm5
	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	/* Leaves only CHARS matching esi as 0.  */
	vpxor	%ymm5, %ymm0, %ymm1
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3
	vpxor	%ymm8, %ymm0, %ymm4

	VPMINU	%ymm1, %ymm5, %ymm1
	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3
	VPMINU	%ymm4, %ymm8, %ymm4
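	/* data ^ CHAR is zero exactly where data == CHAR, and data itself
	   is zero exactly at the null terminator, so the unsigned minimum
	   of the two is zero iff an element is CHAR or null.  One VPCMPEQ
	   against the zeroed ymm9 can then test for both at once.  */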

	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6

	VPMINU	%ymm5, %ymm6, %ymm6

	VPCMPEQ	%ymm6, %ymm9, %ymm6
	vpmovmskb %ymm6, %ecx
	subq	$-(VEC_SIZE * 4), %rdi
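	/* subq with -(VEC_SIZE * 4) rather than addq with VEC_SIZE * 4:
	   -128 fits in a sign-extended 8-bit immediate while +128 does
	   not, giving a shorter encoding.  */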
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm9, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

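	/* ymm5 is the minimum of VEC 0 and VEC 1; since VEC 0 was just
	   ruled out, any zero element in ymm5 must come from VEC 1.  */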
	VPCMPEQ	%ymm5, %ymm9, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm3, %ymm9, %ymm3
	vpmovmskb %ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It is only used
	   if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
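	/* If VEC 2 (rax) had no match, tzcnt lands in the upper 32 bits;
	   VEC 0-2 are already known to be clear, so any bit set there
	   belongs to VEC 3, and a count of 32 + i together with the
	   VEC_SIZE * 2 adjustment below addresses element i of VEC 3.  */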
	subq	$(VEC_SIZE * 2), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x0):
	tzcntl	%eax, %eax
	addq	$-(VEC_SIZE * 4), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [4:0] of COUNT
	   so there is no need to manually mod edx.  */
	sarxl	%edx, %eax, %eax
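	/* The aligned load above starts at the VEC_SIZE block containing
	   the original rdi (saved in rdx), so shifting the mask right by
	   the low bits of rdx drops the bytes before the string start and
	   leaves bit 0 corresponding to the byte at rdx.  */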
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.  */
	cmp	(%rdx, %rax), %CHAR_REG
	leaq	(%rdx, %rax), %rax
	cmovne	%rcx, %rax
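	/* Branchless miss handling: if the byte found is not CHAR (i.e.
	   it was the null terminator), return NULL (rcx) instead of the
	   computed address.  */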
# else
	addq	%rdx, %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

END (STRCHR)
#endif