/* memcmp with SSE2
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (memcmp)
/* int memcmp (const void *s1, const void *s2, size_t n)
   (with USE_AS_MEMCMPEQ this same body builds the equality-only
   variant -- presumably the `memcmp' name is remapped by the
   including file; confirm there).

   SysV AMD64: rdi = s1, rsi = s2, rdx = n; result in eax.
   memcmp returns the signed difference of the first mismatching
   bytes (0 if equal); the USE_AS_MEMCMPEQ variant only guarantees
   zero / nonzero.

   Register roles:
     rdi - cursor into s1;
     rsi - rebased to (s2 - s1) right after the size checks, so the
	   s2 byte matching (%rdi) is addressed as (%rdi,%rsi);
     r10 - bytes remaining, in the small/tail path;
     r11 - one past the end of s1, in the >= 32 byte paths.  */
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	test	%RDX_LP, %RDX_LP
	jz	L(finz)		/* n == 0: buffers compare equal.  */
	cmpq	$1, %rdx
	jbe	L(finr1b)	/* n <= 1: single byte compare.  */
	subq	%rdi, %rsi	/* rsi = s2 - s1; one cursor walks both.  */
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)
	/* Handle small chunks and last block of less than 32 bytes.  */
	/* 0 < r10 < 32 here.  The set bits of r10 select which of the
	   1-, 2-, 4- and 8-byte steps below run; whatever is left after
	   them is exactly 16 bytes, handled at L(s16b).  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi),	%eax
	movzbl	(%rdi, %rsi), %edx
	subq    $1, %r10
	je	L(finz1)	/* That byte was the last one.  */
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)		/* Mismatch: eax already holds result.  */
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi),	%eax
	movzwl	(%rdi, %rsi), %edx
	subq    $2, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$2, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi),	%eax
	movl	(%rdi, %rsi), %edx
	subq    $4, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$4, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi),	%rax
	movq	(%rdi, %rsi), %rdx
	subq    $8, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(sub_return8)
#else
	je	L(fin2_7)
#endif
	addq	$8, %rdi
	cmpq	%rdx, %rax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s16b):
	/* Exactly 16 bytes remain; compare them with one SSE2 op.  */
	movdqu    (%rdi), %xmm1
	movdqu    (%rdi, %rsi), %xmm0
	pcmpeqb   %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
	pmovmskb  %xmm1, %eax
	subl      $0xffff, %eax	/* 0 iff all 16 bytes matched.  */
	ret
#else
	pmovmskb  %xmm1, %edx
	xorl	  %eax, %eax
	subl      $0xffff, %edx	/* 0 iff all 16 bytes matched.  */
	jz	  L(finz)
	/* The lowest set bit of (mask - 0xffff) indexes the first
	   mismatching byte: the borrow clears all lower (equal) bits.  */
	bsfl      %edx, %ecx
	leaq	 (%rdi, %rcx), %rcx
	movzbl	 (%rcx), %eax
	movzbl	 (%rsi, %rcx), %edx
	jmp	 L(finz1)
#endif
	.p2align 4,, 4
L(finr1b):
	/* n <= 1 path: rsi has not been rebased and still points at s2.  */
	movzbl	(%rdi), %eax
	movzbl  (%rsi), %edx
L(finz1):
	subl	%edx, %eax	/* Signed byte difference = return value.  */
L(exit):
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(sub_return8):
	/* Last 8 bytes are in rax/rdx; fold the 64-bit difference into a
	   32-bit value that is nonzero iff they differ.  */
	subq	%rdx, %rax
	movl	%eax, %edx
	shrq	$32, %rax
	orl	%edx, %eax
	ret
#else
	.p2align 4,, 4
L(fin2_7):
	/* rax/rdx hold the last 2-8 bytes loaded (zero-extended); they
	   may still be equal when the length ran out exactly here.  */
	cmpq	%rdx, %rax
	jz	L(finz)
	/* Little endian: the lowest differing bit belongs to the byte at
	   the lowest address.  Round its bit index down to a multiple of
	   8, then shift both words so that byte becomes the low byte.  */
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl  %al, %eax
	sarq	%cl, %rdx
	movzbl  %dl, %edx
	subl	%edx, %eax
	ret
#endif
	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax	/* Equal: return 0.  */
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(neq_early):
	movl	$1, %eax	/* Any nonzero value signals a mismatch.  */
	ret
#endif
	/* For blocks bigger than 32 bytes
	   1. Advance one of the addr pointer to be 16B aligned.
	   2. Treat the case of both addr pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one addr pointer is 16B aligned, use memory version
	      of pcmpeqb.
	*/
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11	/* r11 = s1 + n (end of s1).  */
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi),	%xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb   %xmm0, %xmm1
	pmovmskb  %xmm1, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	/* Head compared; advance rdi to the next 16B boundary (the
	   16 - (rdi & 15) overlap bytes get compared twice).  */
	neg	 %r8
	leaq    16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  */
	/* rdi is 16B aligned now, so (rdi + rsi) is aligned too iff the
	   offset rsi is a multiple of 16.  */
	testq   $15, %rsi
	jz      L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	/* One 16B step so rdi becomes 32B aligned.  */
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10	/* r10 = end rounded down to 32B.  */
	cmpq	%r10, %rdi
        jae	L(mt16)
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb  (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10	/* r10 = end rounded down to 64B.  */
	cmpq	%r10, %rdi
        jae	L(mt32)

L(A64main):
	/* Main unrolled loop: 4 x 16 bytes per iteration.  The aligned
	   s1 side is the pcmpeqb memory operand.  */
	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb  (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	cmpq       %rdi, %r10
	jne       L(A64main)

L(mt32):
	/* Fewer than 64 bytes left; finish any full 32B chunks.  */
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
        jae	L(mt16)

L(A32main):
	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqu    (%rdi,%rsi), %xmm0
	pcmpeqb  (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	cmpq       %rdi, %r10
	jne       L(A32main)
L(mt16):
	/* Tail of fewer than 32 bytes: reuse the small-block code.  */
	subq       %rdi, %r11
	je	  L(finz)
	movq	  %r11, %r10
	jmp	  L(small)

	.p2align 4,, 4
L(neq):
	/* Some byte in the current 16B chunk differed; edx holds the
	   pcmpeqb mask minus 0xffff (see L(s16b) for the bsf trick).  */
#ifdef USE_AS_MEMCMPEQ
	movl	$1, %eax
    ret
#else
	bsfl      %edx, %ecx
	movzbl	 (%rdi, %rcx), %eax
	addq	 %rdi, %rsi	/* Rebuild the absolute s2 cursor.  */
	movzbl	 (%rsi,%rcx), %edx
	jmp	 L(finz1)
#endif

	.p2align 4,, 4
L(ATR):
	/* Both cursors are 16B aligned: movdqa loads are safe here.  */
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
        jae	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi
	cmpq       %rdi, %r10
	je       L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10	/* r10 = end rounded down to 64B.  */
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

L(ATR64):
	cmpq       %rdi, %r10
	je	   L(mt32)

L(ATR64main):
	/* Aligned unrolled loop: 4 x 16 bytes per iteration.  */
	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi
	cmpq       %rdi, %r10
	jne       L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10	/* Any full 32B chunk left?  */
	cmpq	%r10, %rdi
        jae	L(mt16)

L(ATR32res):
	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	movdqa    (%rdi,%rsi), %xmm0
	pcmpeqb   (%rdi), %xmm0
	pmovmskb  %xmm0, %edx
	subl      $0xffff, %edx
	jnz       L(neq)
	addq       $16, %rdi

	cmpq	  %r10, %rdi
	jne       L(ATR32res)

	/* Tail of fewer than 32 bytes: reuse the small-block code.  */
	subq       %rdi, %r11
	je	  L(finz)
	movq	  %r11, %r10
	jmp	  L(small)
	/* Align to 16byte to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)
405
#ifdef USE_AS_MEMCMPEQ
/* Equality-only build: export just the internal hidden symbol.  */
libc_hidden_def (memcmp)
#else
/* Standard build: bcmp is provided as a weak alias of memcmp.  */
# undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
#endif
413