/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_avx2
# endif

/* For wcsrchr the unit of comparison is a 4-byte wchar_t, so the
   element-wise compare/broadcast instructions differ.  */
# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Bytes per YMM vector.  */
# define VEC_SIZE	32

	.section SECTION(.text),"ax",@progbits

/* char *strrchr (const char *s, int c)   (or wchar_t variant).
   ABI:   SysV AMD64.
   In:    rdi = s, esi = c.
   Out:   rax = pointer to the last occurrence of c in s, or NULL.
   Register roles inside the function:
     ymm4       = CHAR broadcast to every element,
     ymm0       = zero vector, used to detect the nul terminator,
     rdi        = current scan pointer (always one VEC past the vector
                  just examined),
     rsi / edx  = base pointer and match bitmask of the most recent
                  vector containing a CHAR match (edx == 0: none yet).
   Clobbers: rax, rcx, rdx, rsi, r8, ymm0-ymm4, flags.  */
ENTRY (STRRCHR)
	movd	%esi, %xmm4
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM4.  */
	VPBROADCAST %xmm4, %ymm4
	vpxor	%xmm0, %xmm0, %xmm0

	/* Check if we may cross page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2	/* ecx: nul-terminator mask.  */
	VPCMPEQ	%ymm1, %ymm4, %ymm3	/* eax: CHAR-match mask.  */
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	addq	$VEC_SIZE, %rdi

	testl	%eax, %eax
	jnz	L(first_vec)

	/* No CHAR seen yet; if the string already ended, result is NULL.  */
	testl	%ecx, %ecx
	jnz	L(return_null)

	andq	$-VEC_SIZE, %rdi
	xorl	%edx, %edx		/* No match remembered yet.  */
	jmp	L(aligned_loop)

	.p2align 4
L(first_vec):
	/* Check if there is a nul CHAR.  */
	testl	%ecx, %ecx
	jnz	L(char_and_nul_in_first_vec)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
	jmp	L(aligned_loop)

	.p2align 4
L(cross_page_boundary):
	/* Align the load down to VEC_SIZE and shift the masks right by
	   the misalignment (cl) so bits below the start of the string
	   are discarded.  */
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %edx
	vpmovmskb %ymm3, %eax
	shrl	%cl, %edx
	shrl	%cl, %eax
	addq	$VEC_SIZE, %rdi

	/* Check if there is a CHAR.  */
	testl	%eax, %eax
	jnz	L(found_char)

	testl	%edx, %edx
	jnz	L(return_null)

	jmp	L(aligned_loop)

	.p2align 4
L(found_char):
	testl	%edx, %edx
	jnz	L(char_and_nul)

	/* Remember the match and keep searching.  rdi + rcx undoes the
	   downward alignment so rsi records the true vector base.  */
	movl	%eax, %edx
	leaq	(%rdi, %rcx), %rsi

	.p2align 4
L(aligned_loop):
	/* 4x unrolled: scan one aligned VEC per copy; fall out as soon
	   as a vector contains a CHAR or the nul terminator.  */
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jz	L(aligned_loop)

	.p2align 4
L(char_nor_null):
	/* Find a CHAR or a nul CHAR in a loop.  */
	testl	%eax, %eax
	jnz	L(match)
L(return_value):
	/* Nul reached with no CHAR in this vector: return the last
	   remembered match (edx/rsi), or NULL if there was none.  */
	testl	%edx, %edx
	jz	L(return_null)
	movl	%edx, %eax
	movq	%rsi, %rdi

# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %eax
# endif
	bsrl	%eax, %eax		/* Highest set bit = last match.  */
	leaq	-VEC_SIZE(%rdi, %rax), %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(match):
	/* Find a CHAR.  Check if there is a nul CHAR.  */
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jnz	L(find_nul)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	jmp	L(aligned_loop)

	.p2align 4
L(find_nul):
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR:
	   (ecx - 1) ^ ecx keeps only bits at or below the first nul.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* If there is no CHAR here, return the remembered one.  */
	jz	L(return_value)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4
L(char_and_nul):
	/* Find both a CHAR and a nul CHAR.  */
	addq	%rcx, %rdi
	movl	%edx, %ecx
L(char_and_nul_in_first_vec):
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* Return null pointer if the nul CHAR comes first.  */
	jz	L(return_null)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4
L(return_null):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

END (STRRCHR)
#endif