/* strstr with unaligned loads
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

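/* strstr (haystack=%rdi, needle=%rsi).  Scan the haystack for places
   where the first two needle characters occur next to each other (or
   where the haystack ends), then verify the remaining needle bytes at
   each such candidate.  One-character needles are handed off to
   __strchr_sse2, and inputs that make candidate verification too
   expensive are handed off to __strstr_sse2.  */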
ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax
	testb	%al, %al
	je	L(empty)
	movzbl	1(%rsi), %edx
	testb	%dl, %dl
	je	L(strchr)
	/* Broadcast needle[0] to every byte of %xmm1 and needle[1] to
	   every byte of %xmm2, interleaved with the page-boundary
	   check: the fast path below may read up to 64 bytes past
	   %rdi, so take L(cross_page) if fewer than 65 bytes are left
	   in %rdi's page.  */
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax
	punpcklbw	%xmm1, %xmm1
	cmpq	$4031, %rax
	punpcklbw	%xmm2, %xmm2
	punpcklwd	%xmm1, %xmm1
	punpcklwd	%xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1
	pshufd	$0, %xmm2, %xmm2
	ja	L(cross_page)
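	/* Check the first 32 bytes.  pcmpeqb produces 0xff for equal
	   bytes, so pminub of the two compare results is the AND of
	   "byte i == needle[0]" and "byte i+1 == needle[1]", and por
	   merges in the null-byte mask.  pmovmskb then yields one
	   candidate bit per haystack position.  */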
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %r8d
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(next_32_bytes)
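/* Each set bit in %r8 marks either a terminating null byte or a
   two-byte needle-prefix match in the first 32 bytes; examine the
   candidates lowest bit first.  */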
L(next_pair_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero1)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found1)
	cmpb	2(%rax), %dl
	jne	L(next_pair)
	xorl	%edx, %edx
	jmp	L(pair_loop_start)

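/* Needle is a single character: tail-call strchr.  */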
	.p2align 4
L(strchr):
	movzbl	%al, %esi
	jmp	__strchr_sse2

	.p2align 4
L(pair_loop):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop)
L(found1):
	ret
L(zero1):
	xorl	%eax, %eax
	ret

	.p2align 4
L(next_pair):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index)

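/* No candidate in the first 32 bytes: check bytes 32..63 the same way,
   placing the candidate bits in the upper half of %r8.  */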
	.p2align 4
L(next_32_bytes):
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %eax
	salq	$32, %rax
	pmovmskb	%xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8
	je	L(loop_header)
L(next_pair2_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero2)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
L(zero2):
	xorl	%eax, %eax
	ret
L(empty):
	movq	%rdi, %rax
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair2_index)
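/* No candidate in the first 64 bytes.  Set up the main loop: %r9
   remembers the original haystack pointer, %rdi is aligned down to a
   64-byte boundary, and %r11 holds a work budget (starting at -512)
   that L(next_pair3) uses to decide when to fall back to
   __strstr_sse2.  */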
L(loop_header):
	movq	$-512, %r11
	movq	%rdi, %r9

	pxor	%xmm7, %xmm7
	andq	$-64, %rdi

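/* Main loop: process 64 haystack bytes per iteration.  Each 16-byte
   chunk is XORed with the broadcast needle[1] and the chunk one byte
   earlier with needle[0]; the OR of the two results has a zero byte
   exactly at the second character of a prefix match.  pminub folds
   the four chunk results together with the raw bytes (a zero byte
   there means end of string), so a single pcmpeqb/pmovmskb against
   zero tests the whole block for "prefix pair or null byte".  */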
	.p2align 4
L(loop):
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	L(loop)
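	/* The block contains a candidate: rebuild exact per-chunk
	   masks and merge them into a 64-bit candidate mask in %r8.
	   Bit i marks the second byte of a pair, so a match starts at
	   %rdi + i - 1.  */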
	pminub	(%rdi), %xmm6
	pminub	32(%rdi), %xmm4
	pminub	48(%rdi), %xmm5
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm5
	pmovmskb	%xmm6, %edx
	movdqa	16(%rdi), %xmm8
	pcmpeqb	%xmm7, %xmm4
	movdqu	15(%rdi), %xmm0
	pmovmskb	%xmm5, %r8d
	movdqa	%xmm8, %xmm3
	pmovmskb	%xmm4, %ecx
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	salq	$32, %rcx
	pcmpeqb	%xmm7, %xmm8
	salq	$48, %r8
	pminub	%xmm0, %xmm3
	orq	%rcx, %rdx
	por	%xmm3, %xmm8
	orq	%rdx, %r8
	pmovmskb	%xmm8, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(loop)
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx
	cmpb	$0, (%rcx)
	je	L(zero)
	xorl	%eax, %eax
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)
	cmpb	1(%rcx), %dl
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	leaq	-1(%rcx), %rax
	ret

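/* Candidate failed.  Add the number of bytes just compared to the
   budget in %r11; once total verification work exceeds the distance
   scanned plus the initial 512-byte allowance, give up and switch to
   the generic strstr, avoiding quadratic behavior on pathological
   inputs.  */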
	.p2align 4
L(next_pair3):
	addq	%rax, %r11
	movq	%rdi, %rax
	subq	%r9, %rax
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index3)
	jmp	L(loop)

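/* Verification cost outgrew the scan distance: let __strstr_sse2 (the
   generic implementation, linear in the worst case) finish.  No match
   can start before the current %rdi, so the arguments are passed on
   unchanged.  */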
	.p2align 4
L(switch_strstr):
	jmp	__strstr_sse2

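/* The first 64 bytes of the haystack lie too close to the end of a
   page for the unaligned loads above.  Build the same candidate mask
   from the surrounding 64-byte-aligned block, then shift out the bits
   that precede the actual start of the haystack.  */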
	.p2align 4
L(cross_page):
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm0, %xmm8
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7
	por	%xmm3, %xmm8
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb	%xmm8, %ecx
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm7, %r8d
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb	%xmm6, %r10d
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8
	movl	%edi, %ecx
	pmovmskb	%xmm0, %edx
	subl	%eax, %ecx
	salq	$48, %rdx
	orq	%rdx, %r8
	shrq	%cl, %r8
	je	L(loop_header)
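/* After the shift, bit i marks the second byte of a candidate pair at
   %rdi + i, so a match starts one byte earlier; a candidate at %rdi
   itself would start before the haystack and is skipped below.  */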
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero)

	cmpq	%rax, %rdi
	je	L(next_pair4)

	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)
	cmpb	1(%rax), %dl
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq	$1, %rax
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index4)
	jmp	L(loop_header)

	.p2align 4
L(found):
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

END(__strstr_sse2_unaligned)