/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

	/* In the 4x loop the RTM and non-RTM versions have data pointers
	   off by VEC_SIZE * 4, with the RTM version being VEC_SIZE * 4
	   greater.  This is represented by BASE_OFFSET.  Additionally,
	   because the RTM version uses vpcmp, which stores a bit per
	   element compared, while the non-RTM version uses vpcmpeq, which
	   stores a bit per byte compared, RET_SCALE of CHAR_SIZE is only
	   relevant for the RTM version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif
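	/* NB: With VEC_SIZE of 32 this makes BASE_OFFSET 128 bytes for
	   the RTM version and 0 otherwise.  */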

	/* In the return from the 4x loop the memchr and rawmemchr
	   versions have data pointers off by VEC_SIZE * 4, with the
	   memchr version being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096
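	/* NB: With VEC_SIZE of 32 bytes, CHAR_PER_VEC is 32 for memchr
	   and rawmemchr (CHAR_SIZE == 1) and 8 for wmemchr
	   (CHAR_SIZE == 4).  */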

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
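	/* NB: If the offset of rdi within its page is greater than
	   PAGE_SIZE - VEC_SIZE, an unaligned VEC_SIZE load from rdi could
	   touch the next, possibly unmapped, page, so that case is
	   handled separately in L(cross_page_boundary).  */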

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 5
L(first_vec_x0):
	/* Check if first match was before length.  */
	tzcntl	%eax, %eax
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
	cmovle	%rcx, %rax
	ret
# else
	/* NB: first_vec_x0 is 17 bytes, which will leave
	   cross_page_boundary (which is relatively cold) close enough
	   to ideal alignment.  So only realign L(cross_page_boundary) for
	   rawmemchr.  */
	.p2align 4
# endif
L(cross_page_boundary):
	/* Save the pointer before aligning, as its original value is
	   necessary for computing the return address if a match is found,
	   or for adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
	   for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide the shift count by 4 since each bit in K0 represents
	   4 bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
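	/* NB: The sarx below shifts out the match bits for the bytes/chars
	   below the original rdi.  For memchr no extra masking of the
	   shift count is needed: sarx takes its count modulo 32 for a
	   32-bit operand, and the load above is 32-byte aligned, so the
	   byte offset within the page is congruent (mod 32) to the byte
	   offset within the vector.  For wmemchr the char offset within
	   the page is not, hence the explicit and above.  */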
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
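	/* NB: esi now holds the char count from the original rdi up to
	   the end of the fourth vector checked below (VEC_SIZE * 5 bytes
	   past the aligned pointer), so subtracting it from rdx leaves
	   the length remaining beyond that point.  */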
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length. If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at most CHAR_PER_VEC * 4 chars of length remain.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM, otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
	 */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which cannot
	   encode the EVEX-only registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop: one that does not require vzeroupper
	   because it avoids ymm0-ymm15, and another that does require
	   vzeroupper because it uses ymm0-ymm15.  The reason ymm0-ymm15
	   is used at all is that there is no EVEX encoding of vpcmpeq,
	   and with vpcmpeq this loop can be performed more efficiently.
	   The non-vzeroupper version is safe for RTM while the vzeroupper
	   version should be preferred if RTM is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* xor will set chars matching CHAR (esi) to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
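	/* NB: A zero element in YMM3, and hence a set bit in k2, now
	   marks a position where VEC1, VEC2 or VEC3 had a match: VEC1
	   matches are forced to zero by the zero-mask (k1 is a
	   not-equals mask), VEC2/VEC3 matches are already zero from the
	   xor, and VPMINU keeps the smaller value.  */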
# else
	/* Since vptern can only take 3x vectors it is fastest to do 1
	   vec separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so we can only
	   save an instruction by using a not-equals mask with vptern
	   for wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and or them all together with vptern.
	 */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* Truth table 1 takes the NOT of the OR of ymm2, ymm3, ymm4 and
	   the zero mask combines in the result from the first VEC
	   (k1).  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 (0xfe) is the truth table for ORing ymm2, ymm3, ymm4 into
	   ymm4: the result bit is 0 only when all three input bits are
	   0.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
# endif

# ifdef USE_AS_RAWMEMCHR
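	/* NB: subq of -(VEC_SIZE * 4) encodes the immediate in a signed
	   byte, which addq of +(VEC_SIZE * 4) would not.  */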
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the NOT of the matches.  All 1s means no matches.
	   incl will wrap around to zero and set the zero flag in that
	   case.  */
	incl	%ecx
#  else
	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
	   Adding eax into ecx is not an issue because if eax is non-zero
	   it will be used for returning the match, and if it is zero the
	   add does nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case of fewer than 4 vectors of length
	   remaining.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret


	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if match was found in
	   loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the NOT of the matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
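	/* NB: In both cases eax is now zero iff VEC1 had no match, and
	   tzcnt in L(last_vec_x1_return) still yields the index of the
	   first match: subtracting the all-ones mask negates the match
	   mask (tzcnt (-x) == tzcnt (x)), and incl sets the lowest clear
	   bit of the not-equals mask, which marks the first match.  */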
# else
	/* eax already has matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  ecx
	   may still carry VEC3 bits from the loop, but after the shift
	   those sit above the VEC3 bits already in eax, so tzcnt still
	   finds the first match.  */
	salq	$VEC_SIZE, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
	   if relevant (RET_SCALE == CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined, otherwise RET_SCALE == 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create mask for possible matches within remaining length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
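	/* NB: bzhi clears every bit at position edx/rdx and above, so ecx
	   keeps a set bit only for char positions that lie within the
	   remaining length.  */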
	/* Test matches in the data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after the
	   remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift remaining length mask for last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

END (MEMCHR)
#endif