1/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
2   Copyright (C) 2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Name of the generated entry point.  A file that includes this one
   may define STRRCHR beforehand to choose a different symbol.  */
# if !defined STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

/* Number of bytes in one YMM vector.  */
# define VEC_SIZE	32

/* Vector registers.  YMM1 receives the string data, YMMMATCH holds
   CHAR in every element and YMMZERO is all zero; XMMZERO names the low
   128 bits of YMMZERO.  */
# define YMM1		ymm18
# define YMMMATCH	ymm17
# define YMMZERO	ymm16
# define XMMZERO	xmm16

/* Full-vector loads: VMOVA for addresses known to be VEC_SIZE aligned,
   VMOVU for everything else.  */
# define VMOVA		vmovdqa64
# define VMOVU		vmovdqu64

/* Element-size dependent pieces.  SHIFT_REG is the register holding
   the shift count applied to the compare masks on the cross-page path:
   the byte offset itself (already in ECX) for byte strings, or that
   offset divided by 4 (kept in R8D) for wchar_t strings.  */
# ifndef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP		vpcmpb
#  define SHIFT_REG	ecx
# else
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP		vpcmpd
#  define SHIFT_REG	r8d
# endif
46
47	.section .text.evex,"ax",@progbits
/* Number of string bytes represented by one bit of a compare mask.
   Used as the index scale when a mask bit number is turned back into
   an address.  */
# ifdef USE_AS_WCSRCHR
#  define BYTES_PER_MASK_BIT	4
# else
#  define BYTES_PER_MASK_BIT	1
# endif

/* Compare the vector in YMM1 against zero and against CHAR.
   On exit K0 and NUL have one bit set per null character, K1 and CHR
   have one bit set per character equal to CHAR.  NUL and CHR are
   32-bit general purpose registers, passed with their '%' prefix.  */
# define VEC_PROBE(nul, chr) \
	VPCMP	$0, %YMMZERO, %YMM1, %k0; \
	VPCMP	$0, %YMMMATCH, %YMM1, %k1; \
	kmovd	%k0, nul; \
	kmovd	%k1, chr

/* STRRCHR: find the last occurrence of CHAR in a null-terminated
   string (strrchr, or wcsrchr when USE_AS_WCSRCHR is defined).

   In:	rdi = start of the string
	esi = CHAR
   Out:	rax = address of the last CHAR found before (or at) the
	      terminating null, or 0 if there is none.

   Register roles after the prologue:
	rdi	address just past the vector most recently loaded
	eax	CHAR mask of the current vector
	ecx	null mask of the current vector (inside the loop it
		is OR-ed with the CHAR mask to get a single test)
	edx	CHAR mask of the last vector known to contain CHAR
		with no null in it; 0 while there is none
	rsi	valid only when edx != 0: bit N of edx refers to
		address rsi - VEC_SIZE + N * BYTES_PER_MASK_BIT
	r8d	scratch (and the shift count for wcsrchr on the
		cross-page path)

   Clobbers rcx, rdx, rsi, rdi, r8, k0, k1, ymm16-ymm18 and flags.  */
ENTRY (STRRCHR)
	movl	%edi, %ecx
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO

	/* An unaligned VEC_SIZE load is only issued when the string
	   starts at an offset of at most VEC_SIZE inside its
	   2 * VEC_SIZE aligned block: such a load stays inside that
	   block and therefore cannot cross a page boundary (assuming
	   the page size is a multiple of 2 * VEC_SIZE).  Any other
	   start offset takes the aligned-load path below.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	VMOVU	(%rdi), %YMM1
	VEC_PROBE (%ecx, %eax)

	addq	$VEC_SIZE, %rdi

	testl	%eax, %eax
	jnz	L(first_vec)

	/* No CHAR in the first vector; a null here ends the search.  */
	testl	%ecx, %ecx
	jnz	L(return_null)

	/* Continue with aligned loads.  Rounding RDI down may re-read
	   a few characters that were already checked.  EDX = 0 records
	   that no match has been seen yet.  */
	andq	$-VEC_SIZE, %rdi
	xorl	%edx, %edx
	jmp	L(aligned_loop)

	.p2align 4
L(first_vec):
	/* CHAR found in the first vector.  If the vector also holds a
	   null the answer is decided here.  */
	testl	%ecx, %ecx
	jnz	L(char_and_nul_in_first_vec)

	/* Remember the match (mask in EDX, position in RSI) and keep
	   searching from the next aligned address.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
	jmp	L(aligned_loop)

	.p2align 4
L(cross_page_boundary):
	/* ECX = byte offset of the string inside its VEC_SIZE aligned
	   vector; RDI = start of that vector.  */
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifdef USE_AS_WCSRCHR
	/* NB: Divide shift count by 4 since each bit in K1 represents
	   4 bytes.  */
	movl	%ecx, %SHIFT_REG
	sarl	$2, %SHIFT_REG
# endif

	VMOVA	(%rdi), %YMM1
	VEC_PROBE (%edx, %eax)

	/* Drop the mask bits of the characters that precede the start
	   of the string, so that bit 0 refers to its first
	   character.  */
	shrxl	%SHIFT_REG, %edx, %edx
	shrxl	%SHIFT_REG, %eax, %eax
	addq	$VEC_SIZE, %rdi

	/* Check if there is a CHAR.  */
	testl	%eax, %eax
	jnz	L(found_char)

	/* No CHAR; a null here ends the search.  */
	testl	%edx, %edx
	jnz	L(return_null)

	/* EDX is known to be zero here, i.e. "no match seen yet".  */
	jmp	L(aligned_loop)

	.p2align 4
L(found_char):
	testl	%edx, %edx
	jnz	L(char_and_nul)

	/* Remember the match and keep searching.  The mask was
	   shifted by the start offset, so the saved position is
	   offset by the same number of bytes (still in RCX).  */
	movl	%eax, %edx
	leaq	(%rdi, %rcx), %rsi

	/* Main loop, unrolled four times.  Each step loads one aligned
	   vector, advances RDI past it and leaves the loop as soon as
	   the vector holds a CHAR or a null.  */
	.p2align 4
L(aligned_loop):
	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi
	VEC_PROBE (%ecx, %eax)
	orl	%eax, %ecx
	jnz	L(char_or_nul)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi
	VEC_PROBE (%ecx, %eax)
	orl	%eax, %ecx
	jnz	L(char_or_nul)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi
	VEC_PROBE (%ecx, %eax)
	orl	%eax, %ecx
	jnz	L(char_or_nul)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi
	VEC_PROBE (%ecx, %eax)
	orl	%eax, %ecx
	jz	L(aligned_loop)

	.p2align 4
L(char_or_nul):
	/* The current vector holds a CHAR, a null, or both.  EAX is
	   the CHAR mask; ECX was overwritten by the OR above, the
	   null mask is still available in K0.  */
	testl	%eax, %eax
	jnz	L(match)
L(return_value):
	/* The string ends with no further CHAR: return the remembered
	   match, or a null pointer if there never was one.  */
	testl	%edx, %edx
	jz	L(return_null)
	/* The highest set bit is the last match in that vector.  */
	bsrl	%edx, %eax
	leaq	-VEC_SIZE(%rsi, %rax, BYTES_PER_MASK_BIT), %rax
	ret

	.p2align 4
L(match):
	/* Found a CHAR.  Reload the null mask and check whether the
	   string also ends in this vector.  */
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(find_nul)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	jmp	L(aligned_loop)

	.p2align 4
L(find_nul):
	/* Mask out any matching bits after the null byte:
	   (ECX - 1) ^ ECX has every bit set up to and including the
	   lowest set bit of ECX, i.e. up to the first null.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	/* The AND already set ZF.  If there is no CHAR here, return
	   the remembered one.  */
	jz	L(return_value)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax, BYTES_PER_MASK_BIT), %rax
	ret

	.p2align 4
L(char_and_nul):
	/* Cross-page path found both a CHAR and a null.  Add the
	   start offset to RDI to match the shifted masks, then move
	   the null mask into ECX for the shared code below.  */
	addq	%rcx, %rdi
	movl	%edx, %ecx
L(char_and_nul_in_first_vec):
	/* Mask out any matching bits after the null byte (same
	   computation as in L(find_nul)).  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	/* The AND already set ZF.  Return null pointer if the null
	   byte comes first.  */
	jz	L(return_null)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax, BYTES_PER_MASK_BIT), %rax
	ret

	.p2align 4
L(return_null):
	xorl	%eax, %eax
	ret

END (STRRCHR)
265#endif
266