1/* SSE2 version of strlen and SSE4.1 version of wcslen.
2   Copyright (C) 2012-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
#ifdef AS_WCSLEN
/* wcslen scans 4-byte wchar_t units: use the dword forms of min and
   compare, and scale the byte count in %rax down to a character
   count before returning.  (pminud is SSE4.1, hence the SSE4.1
   requirement for the wcslen build.)  */
# define PMINU		pminud
# define PCMPEQ		pcmpeqd
# define SHIFT_RETURN	shrq $2, %rax
#else
/* strlen/strnlen scan bytes; the byte count needs no scaling.  */
# define PMINU		pminub
# define PCMPEQ		pcmpeqb
# define SHIFT_RETURN
#endif
30
/* Long lived registers in strlen (s) and strnlen (s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s + n) & ~(64 - 1)   (AS_STRNLEN only)
	%r11  - s + n                 (AS_STRNLEN only)
*/
38
39
.text

/* Inputs: %rdi = s; with AS_STRNLEN also %RSI_LP = maxlen.
   Output: %rax = length (scaled from bytes to characters by
   SHIFT_RETURN in the AS_WCSLEN build).  Only caller-saved registers
   are used, so there is no prologue and no stack frame.  */
ENTRY(strlen)

/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
/* %xmm0-%xmm3 must be zero on entry.  The four 16-byte compare
   results are merged into a single 64-bit mask: bit i of %rdx is set
   iff byte %rax[i] belongs to a zero unit (a zero byte, or for
   wcslen a zero dword).  Clobbers %rsi, %rcx, %r8 and leaves the
   compare results in %xmm0-%xmm3.  */
#define FIND_ZERO	\
	PCMPEQ	(%rax), %xmm0;	\
	PCMPEQ	16(%rax), %xmm1;	\
	PCMPEQ	32(%rax), %xmm2;	\
	PCMPEQ	48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

#ifdef AS_STRNLEN
/* Do not read anything when n==0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
# ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t). If it would
   overflow the only way this program doesn't have undefined behavior
   is if there is a null terminator in valid memory so wcslen will
   suffice.  */
	mov	%RSI_LP, %R10_LP
	sar	$62, %R10_LP
	jnz	__wcslen_sse4_1
	sal	$2, %RSI_LP	/* Convert maxlen to a byte count.  */
# endif

/* Initialize long lived registers.  %r11 = s + n (end pointer),
   %r10 = end pointer aligned down to a 64-byte boundary.  */
	add	%RDI_LP, %RSI_LP
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP
	mov	%RSI_LP, %R11_LP
#endif

/* Zero the comparison registers that FIND_ZERO/PCMPEQ expect.  */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	movq	%rdi, %rax	/* %rax = current read position.  */
	movq	%rdi, %rcx
	andq	$4095, %rcx	/* Offset of s within its 4 KiB page.  */
/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja	L(cross_page)

#ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  Compute the remaining byte
   count %rsi = %r11 - %rax, align %rax down to 64 for the main loop,
   and take the end-capping return path (L(strnlen_ret)) when fewer
   than 64 bytes remain.  */
# define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
#else
# define STRNLEN_PROLOG  andq $-64, %rax;
#endif

/* Ignore bits in mask that come before start of string.  */
/* %rcx = %rdi ^ %rax is the byte offset of s within the block %rax
   was rounded down to (only low bits differ), so shifting the mask
   right by %cl discards matches before s; SARQ uses %cl modulo 64.
   After the shift a set bit's index is its distance from s, so BSF
   yields the length directly.  */
#define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;		\
	ret

#ifdef AS_STRNLEN
/* strnlen: do one 16-byte-aligned 64-byte check; an end pointer that
   falls inside it is handled by STRNLEN_PROLOG within PROLOG.  */
	andq	$-16, %rax
	FIND_ZERO
#else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je 	L(next48_bytes)
	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes.  */
	andq	$-16, %rax
	PCMPEQ 16(%rax), %xmm1
	PCMPEQ 32(%rax), %xmm2
	PCMPEQ 48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
#endif

	/* When no zero byte is found xmm1-3 are zero so we do not have to
	   zero them.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
/* s is within 16 bytes of the end of its page, so an unaligned
   16-byte load might fault; align down to 64 and do an aligned
   64-byte check, which cannot cross the page.  */
	andq	$-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

#ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).  */
L(strnlen_ret):
/* Fewer than 64 bytes remain: %rsi (< 64) is the distance from the
   position FIND_ZERO scanned to the end pointer.  Set that bit in
   the mask so BSF stops at the end even when no NUL precedes it.  */
	bts	%rsi, %rdx
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
#endif
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
#ifdef AS_STRNLEN
	.p2align 4
L(loop):
/* Scan 64 aligned bytes per iteration, stopping at %r10 (the end
   pointer aligned down).  The PMINU of the four vectors has a zero
   lane iff some vector has one, so a single PCMPEQ/PMOVMSKB tests
   all 64 bytes at once.  */
	addq	$64, %rax
	cmpq	%rax, %r10
	je	L(exit_end)

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first) /* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

L(first):
/* Cap the result at n: BTS takes the bit index modulo 64, so
   %r11 selects the end pointer's offset within this final 64-byte
   block.  BSF then finds min(first NUL, end).  */
	bts	%r11, %rdx
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
/* A zero unit lies in the current 64 bytes: redo the comparison per
   16-byte chunk with FIND_ZERO to recover its exact bit position.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax	/* %rax = address of first zero unit.  */
	subq	%rdi, %rax	/* Length = address - s.  */
	SHIFT_RETURN
	ret

#else

	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
	.p2align 4
L(loop):
/* The prolog already tested the first 64 bytes from the aligned
   start, so scanning begins at offset 64.  */
	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	subq	$-128, %rax	/* += 128; -128 fits a sign-extended imm8,
				   +128 would need a 4-byte immediate.  */

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	.p2align 4
L(exit64):
	addq	$64, %rax	/* Zero was in the first unrolled half.  */
L(exit0):
/* Redo the comparison with FIND_ZERO to locate the exact position of
   the first zero unit within these 64 bytes.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax	/* %rax = address of first zero unit.  */
	subq	%rdi, %rax	/* Length = address - s.  */
	SHIFT_RETURN
	ret

#endif

END(strlen)
264