/* strcmp with unaligned loads
   Copyright (C) 2013-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

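/* strcmp using unaligned 16-byte SSE2 loads, comparing 16 or 64 bytes
   per step.  For each 16-byte chunk, pcmpeqb marks the bytes where the
   two strings match, pminub with the s1 data additionally zeroes the
   positions where s1 has a NUL, and a final pcmpeqb against zero plus
   pmovmskb turn that into a bit mask of mismatch-or-end-of-string
   positions; bsf on the mask gives the offset of the first such byte.
   If OR-ing the two pointers' page offsets leaves fewer than 64 bytes
   before the end of a 4 KiB page, the code conservatively assumes a
   64-byte read could fault and starts with a byte-by-byte loop.  */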
ENTRY (__strcmp_sse2_unaligned)
	movl	%edi, %eax
	xorl	%edx, %edx
	pxor	%xmm7, %xmm7
	orl	%esi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)
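	/* First 16 bytes: %xmm1 = s1, %xmm0 = s2.  After the
	   pcmpeqb/pminub pair, %xmm0 is zero exactly where the strings
	   differ or where s1 has a NUL byte; the final pcmpeqb against
	   the zeroed %xmm1 followed by pmovmskb extracts those
	   positions as a bit mask in %eax.  */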
	movdqu	(%rdi), %xmm1
	movdqu	(%rsi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm1, %xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	testq	%rax, %rax
	je	L(next_48_bytes)
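	/* %rax holds a bit mask of mismatch/NUL positions relative to
	   %rdi and %rsi; return the byte difference at the first set
	   bit.  */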
L(return):
	bsfq	%rax, %rdx
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
	ret

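	/* The first 16 bytes were equal and contained no NUL.  Check
	   bytes 16..63 the same way and merge the three 16-bit masks,
	   shifted by 16, 32 and 48, into one 64-bit mask in %rax.  */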
	.p2align 4
L(next_48_bytes):
	movdqu	16(%rdi), %xmm6
	movdqu	16(%rsi), %xmm3
	movdqu	32(%rdi), %xmm5
	pcmpeqb	%xmm6, %xmm3
	movdqu	32(%rsi), %xmm2
	pminub	%xmm6, %xmm3
	pcmpeqb	%xmm1, %xmm3
	movdqu	48(%rdi), %xmm4
	pcmpeqb	%xmm5, %xmm2
	pmovmskb	%xmm3, %edx
	movdqu	48(%rsi), %xmm0
	pminub	%xmm5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm4, %xmm0
	pmovmskb	%xmm2, %eax
	salq	$16, %rdx
	pminub	%xmm4, %xmm0
	pcmpeqb	%xmm1, %xmm0
	salq	$32, %rax
	orq	%rdx, %rax
	pmovmskb	%xmm0, %ecx
	movq	%rcx, %rdx
	salq	$48, %rdx
	orq	%rdx, %rax
	jne	L(return)
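	/* No difference and no NUL in the first 64 bytes.  Advance s1
	   to the next 64-byte boundary (%rax) and move s2 by the same
	   amount (%rdx), then compute in %rsi how many 64-byte blocks
	   fit before s2 reaches a 4 KiB page boundary; the main loop
	   counts this down and diverts to L(loop_cross_page) when it
	   reaches zero.  */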
L(main_loop_header):
	leaq	64(%rdi), %rdx
	movl	$4096, %ecx
	pxor	%xmm9, %xmm9
	andq	$-64, %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$4095, %esi
	subq	%rsi, %rcx
	shrq	$6, %rcx
	movq	%rcx, %rsi
	jmp	L(loop_start)

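	/* Main loop: compare 64 bytes per iteration, aligned loads from
	   s1 (%rax), unaligned loads from s2 (%rdx).  The four 16-byte
	   results are folded together with pminub so that one pcmpeqb
	   against %xmm7 (all zeroes) and one pmovmskb detect any
	   mismatch or NUL in the whole block.  */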
	.p2align 4
L(loop):
	addq	$64, %rax
	addq	$64, %rdx
L(loop_start):
	testq	%rsi, %rsi
	leaq	-1(%rsi), %rsi
	je	L(loop_cross_page)
L(back_to_loop):
	movdqu	(%rdx), %xmm0
	movdqu	16(%rdx), %xmm1
	movdqa	(%rax), %xmm2
	movdqa	16(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqu	32(%rdx), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqu	48(%rdx), %xmm6
	pminub	%xmm3, %xmm1
	movdqa	32(%rax), %xmm2
	pminub	%xmm1, %xmm0
	movdqa	48(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	pminub	%xmm5, %xmm0
	pminub	%xmm6, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
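	/* Some byte in this block differs or is NUL.  Rebuild the four
	   per-chunk masks (the first chunk is reloaded because its
	   intermediate result was folded away), combine them into one
	   64-bit mask in %rcx and return the byte difference at the
	   first set bit.  */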
	pcmpeqb	%xmm7, %xmm5
	movdqu	(%rdx), %xmm0
	pcmpeqb	%xmm7, %xmm1
	movdqa	(%rax), %xmm2
	pcmpeqb	%xmm2, %xmm0
	pminub	%xmm2, %xmm0
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rcx
	orq	%rdi, %rcx
	salq	$48, %rsi
	orq	%rsi, %rcx
	bsfq	%rcx, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

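	/* s2 (%rdx) is about to cross a 4 KiB page.  Back up to the
	   previous 64-byte boundary (%r10 = -(%rdx & 63)) so the s2
	   loads are aligned and cannot touch the next page, build the
	   64-bit mask for that block, and shift it right by the
	   misalignment %r9 so bytes before the current position are
	   ignored.  If nothing is found, reset the iteration counter
	   and rejoin the main loop.  */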
	.p2align 4
L(loop_cross_page):
	xor	%r10, %r10
	movq	%rdx, %r9
	and	$63, %r9
	subq	%r9, %r10

	movdqa	(%rdx, %r10), %xmm0
	movdqa	16(%rdx, %r10), %xmm1
	movdqu	(%rax, %r10), %xmm2
	movdqu	16(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqa	32(%rdx, %r10), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqa	48(%rdx, %r10), %xmm6
	pminub	%xmm3, %xmm1
	movdqu	32(%rax, %r10), %xmm2
	movdqu	48(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6

	pcmpeqb	%xmm7, %xmm0
	pcmpeqb	%xmm7, %xmm1
	pcmpeqb	%xmm7, %xmm5
	pcmpeqb	%xmm7, %xmm6

	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rdi
	orq	%rcx, %rdi
	salq	$48, %rsi
	orq	%rsi, %rdi
	movq	%r9, %rcx
	movq	$63, %rsi
	shrq	%cl, %rdi
	test	%rdi, %rdi
	je	L(back_to_loop)
	bsfq	%rdi, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

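	/* Byte-by-byte comparison, used when the prologue cannot rule
	   out that a 64-byte load from either string would cross into
	   an unmapped page.  %rdx is the byte index; after 64 equal,
	   non-NUL bytes the vector main loop takes over.  */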
	.p2align 4
L(cross_page_loop):
	cmpb	%cl, %al
	jne	L(different)
	addq	$1, %rdx
	cmpq	$64, %rdx
	je	L(main_loop_header)
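	/* External entry from the prologue, with the index %rdx cleared
	   to zero; also reached by falling through from the loop
	   above.  */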
L(cross_page):
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
	testb	%al, %al
	jne	L(cross_page_loop)
	xorl	%eax, %eax
L(different):
	subl	%ecx, %eax
	ret
END (__strcmp_sse2_unaligned)

#endif