/* strchr/strchrnul optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16

# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18
# define YMM2		ymm19
# define YMM3		ymm20
# define YMM4		ymm21
# define YMM5		ymm22
# define YMM6		ymm23
# define YMM7		ymm24
# define YMM8		ymm25

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section .text.evex,"ax",@progbits
ENTRY (STRCHR)
	/* Broadcast CHAR to YMM0.  */
	VPBROADCAST	%esi, %YMM0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO

	/* Check if we cross page boundary with one vector load.
	   Otherwise it is safe to use an unaligned load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
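	/* With PAGE_SIZE == 4096 and VEC_SIZE == 32 the check above
	   branches exactly when the 32-byte load would extend past the
	   current page: a page offset above 4064 means the load could
	   touch the next, possibly unmapped, page.  */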

	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
	   null byte.  */
	VMOVU	(%rdi), %YMM1

	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM2, %k0
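	/* The xor/min pair above produces a zero lane exactly where
	   YMM1 held CHAR (the xor yields 0) or a null (the min
	   re-selects the 0 from YMM1), so a single compare against
	   zero detects both cases at once.  */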
	kmovd	%k0, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rax), %CHAR_REG
	jne	L(zero)
# endif
	ret

	/* .p2align 5 helps keep performance more consistent if ENTRY()
	   alignment % 32 was either 16 or 0. It also fixes the
	   alignment % 32 of loop_4x_vec, which makes it easier to
	   tune.  */
	.p2align 5
L(first_vec_x3):
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x4):
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	kmovd	%k1, %ecx
	/* The bzhi result is nonzero if the first match was null.  */
	bzhil	%eax, %ecx, %ecx
	jne	L(zero)
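	/* bzhi zeros all bits of the null-match mask at and above the
	   index of the first CHAR match, so a nonzero result means a
	   null terminator precedes the first CHAR and NULL must be
	   returned.  */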
# else
	/* Combine CHAR and null matches.  */
	kord	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	kmovd	%k1, %ecx
	/* The bzhi result is nonzero if the first match was null.  */
	bzhil	%eax, %ecx, %ecx
	jne	L(zero)
# else
	/* Combine CHAR and null matches.  */
	kord	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. Use two alternating
	   methods for checking VEC to balance latency and port
	   contention.  */
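	/* Roughly: the xor + min + single-compare method needs only
	   one port-5 compare per vector but has the longer
	   xor -> min -> compare dependency chain, while the
	   two-compare + kortest method is shorter but issues two
	   port-5 compares.  Alternating them balances the two
	   resources.  */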

	/* This method has higher latency but has better port
	   distribution.  */
	VMOVA	(VEC_SIZE)(%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	/* This method has lower latency but has worse port
	   distribution.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null byte in YMM1.  */
	VPCMP	$0, %YMM1, %YMMZERO, %k1
	kortestd	%k0, %k1
	jnz	L(first_vec_x2)
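	/* kortest sets ZF iff the OR of both masks is zero, so a
	   single branch tests for either a CHAR or a null match.  */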

	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null byte in YMM1.  */
	VPCMP	$0, %YMM1, %YMMZERO, %k1
	kortestd	%k0, %k1
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 for the loop.  */
	addq	$VEC_SIZE, %rdi
	andq	$-(VEC_SIZE * 4), %rdi
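	/* The add before the and guarantees that the loop's first load
	   at (VEC_SIZE * 4)(%rdi) starts no later than the first byte
	   not yet checked; it may re-scan a few already-checked (and
	   necessarily match-free) vectors, which is harmless.  */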

	.p2align 4
L(loop_4x_vec):
	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
	   encoding.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4

	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
	   zero.  */
	vpxorq	%YMM1, %YMM0, %YMM5
	/* For YMM2 and YMM4 compare not-equals against CHAR and store
	   the result in a k register. It is possible to save either 1
	   or 2 instructions by using the not-equals method for YMM1,
	   or for YMM1 and YMM3, respectively, but the bottleneck on p5
	   makes it not worth it.  */
	VPCMP	$4, %YMM0, %YMM2, %k2
	vpxorq	%YMM3, %YMM0, %YMM7
	VPCMP	$4, %YMM0, %YMM4, %k4

	/* Use min to select all zeros, from either the xor (CHAR
	   match) or the end of the string.  */
	VPMINU	%YMM1, %YMM5, %YMM1
	VPMINU	%YMM3, %YMM7, %YMM3

	/* Use min + zero-masking to select for zeros. Since k2 and k4
	   have 0 at the positions that matched CHAR, the zero-masking
	   writes zero into the corresponding lanes of YMM2 / YMM4.  */
	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
	VPMINU	%YMM3, %YMM4, %YMM4
	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
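	/* After the masked mins, YMM4 has a zero lane iff any of
	   YMM1..YMM4 contained CHAR or a null, so the single compare
	   below tests all four vectors at once.  */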

	VPCMP	$0, %YMMZERO, %YMM4, %k1
	kmovd	%k1, %ecx
	subq	$-(VEC_SIZE * 4), %rdi
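	/* NB: Subtracting -(VEC_SIZE * 4) instead of adding 128 lets
	   the immediate fit in a sign-extended 8-bit field, giving a
	   shorter encoding for the same pointer update.  */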
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)

	VPCMP	$0, %YMMZERO, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, %YMMZERO, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	VPCMP	$0, %YMMZERO, %YMM3, %k0
	kmovd	%k0, %eax
	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
# ifdef USE_AS_WCSCHR
	sall	$8, %ecx
	orl	%ecx, %eax
	tzcntl	%eax, %eax
# else
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
# endif
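	/* A 32-byte vector holds CHAR_PER_VEC characters, so the YMM4
	   mask is shifted up by only 8 bits for wcschr and the
	   combined mask fits in 32 bits, while strchr needs the 64-bit
	   shift/or/tzcnt to append a full 32 mask bits.  */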
# ifndef USE_AS_STRCHRNUL
	/* Check if match was CHAR or null.  */
	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi.  */
	andq	$-VEC_SIZE, %rdi
	VMOVA	(%rdi), %YMM1
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM2, %k0
	kmovd	%k0, %eax
	/* Remove the leading bits.  */
# ifdef USE_AS_WCSCHR
	movl	%edx, %SHIFT_REG
	/* NB: Divide shift count by 4 since each bit in K0 represents
	   4 bytes.  */
	sarl	$2, %SHIFT_REG
	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
# endif
	sarxl	%SHIFT_REG, %eax, %eax
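	/* sarx shifts out the mask bits for characters that lie before
	   the original, unaligned start of the string; it also takes
	   the shift count from an arbitrary register, so no %cl
	   juggling is needed.  */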
	/* If eax is zero continue.  */
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check to see if match was CHAR or null.  */
	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
# else
	addq	%rdx, %rax
# endif
	ret

END (STRCHR)
#endif