/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif
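
/* With USE_AS_WCSLEN the dword forms (vpcmpd/vpminud) are used, so each
   mask bit produced by VPCMP corresponds to one 4-byte wchar_t rather
   than one byte.  */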

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
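
/* CHAR_PER_VEC is the number of CHARs (bytes, or wchar_t for wcslen)
   covered by one vector compare: 32 for strlen/strnlen, 8 for
   wcslen/wcsnlen.  All length arithmetic below is done in CHARs.  */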

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
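	/* NB: the EVEX-encoded vpxorq on xmm16 zeroes all of zmm16, so
	   YMMZERO is guaranteed zero for the null-CHAR compares.  */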
	/* Clear high bits from edi, keeping only the bits relevant to
	   the page-cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross a page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
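	/* The check above is, roughly, in C (illustrative only):

	       if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
	           goto cross_page_boundary;

	   When it does not branch, a full VEC_SIZE load from s cannot
	   touch the following page, so reading past the terminator is
	   safe.  */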

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null CHAR.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
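	/* E.g. (illustrative): with maxlen == 5 and no null among the
	   low 5 mask bits, btsq makes bit 5 the lowest set bit and
	   tzcnt returns 5 == maxlen.  */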
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the CHAR count in ecx (set up at cross_page_continue) to
	   compute the return value.  */
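	/* (Illustrative derivation.)  At cross_page_continue, rcx was
	   set to (rdi - start) / CHAR_SIZE + CHAR_PER_VEC * 5 + 1.  The
	   null found here lies at CHAR index
	   (rdi - start) / CHAR_SIZE + CHAR_PER_VEC + rax, which equals
	   rcx + rax - (CHAR_PER_VEC * 4 + 1), as computed below.  */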
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the CHAR count in ecx (set up at cross_page_continue) to
	   compute the return value.  */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the CHAR count in ecx (set up at cross_page_continue) to
	   compute the return value.  */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the CHAR count in ecx (set up at cross_page_continue) to
	   compute the return value.  */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
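	/* (strnlen)  rcx now holds, in CHARs, the distance from the
	   original start to one CHAR past the fourth vector checked
	   below.  Subtracting it from rsi leaves the length remaining
	   beyond those vectors and borrows (CF set) when the limit
	   falls inside them.  */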
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if we are within the last VEC_SIZE * 4 bytes.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi
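	/* The andq rounds rdi down, so the first loop iteration may
	   re-examine CHARs already checked above; that is harmless and
	   keeps the loop loads VEC_SIZE * 4 aligned.  The length
	   readjustment above accounts for the same rounding.  */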

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

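	/* An unsigned minimum is zero iff at least one input is zero,
	   so a zero CHAR in YMM2 means a null in VEC 4 or VEC 5, and a
	   zero CHAR in YMM4 means a null in VEC 6 or VEC 7.  Two
	   compares plus one kortestd thus cover all four vectors.  */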
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd	%k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
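	/* Roughly (illustrative): compute
	   tzcnt ((vec2_mask << CHAR_PER_VEC) | vec1_mask) to index the
	   first null across both vectors.  For wcslen the combined mask
	   is only 16 bits, so 32-bit instructions suffice; for strlen
	   it is 64 bits wide, hence the 64-bit variant.  */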
# ifdef USE_AS_WCSLEN
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* If remaining length > CHAR_PER_VEC * 2 handle in last_4x_vec.
	   The single-bit test works even if esi is off by
	   CHAR_PER_VEC * 4.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off, negative or positive, by
	   CHAR_PER_VEC * 4 depending on the entry path.  The masking
	   fixes that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding.  */
	.p2align 4
L(second_vec_return):
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
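	/* kunpckdq concatenates the two 32-bit byte masks into one
	   64-bit mask (k1 in the high half, k0 in the low half);
	   kunpckbw does the same with the two 8-bit wcslen masks.
	   tzcnt of the result then indexes the first null across YMM3
	   and YMM4.  */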
# ifdef USE_AS_WCSLEN
	kunpckbw	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	kunpckdq	%k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
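	/* The aligned load rounded rdi down, so the low mask bits refer
	   to CHARs before the start of the string.  Roughly
	   (illustrative):

	       mask >>= ((uintptr_t) s % VEC_SIZE) / CHAR_SIZE;

	   After the shift below, bit 0 again corresponds to the first
	   CHAR of the string.  */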
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
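	/* eax now holds the number of CHARs from s to the end of the
	   aligned vector, i.e. how many CHARs were just verified to be
	   non-null.  If maxlen is larger, continue on the main path;
	   otherwise all maxlen CHARs were checked and the answer is
	   maxlen.  */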
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif
