/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif
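/* With the values above, VEC_SIZE * 4 = 128 = 1 << 7, so shifting a byte
   count right by DIVIDE_BY_VEC_4_SHIFT divides it by 128, and one
   4096-byte page holds PAGE_SIZE / (VEC_SIZE * 4) = 32 such blocks.  */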

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Compute the minimum of packed unsigned dwords.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Compute the minimum of packed unsigned bytes.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Warning!
           wcscmp/wcsncmp have to use SIGNED comparison for elements.
           strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

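/* For example, strcmp comparing the bytes 0x80 and 0x01 must treat 0x80
   as 128 (unsigned) and return a positive value, while wcscmp comparing
   the wide chars 0xffffff00 and 0x00000001 treats the former as a
   negative wchar_t (signed) and returns a negative value.  */
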
/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison
   operates on either packed bytes or dwords depending on USE_AS_WCSCMP.
   In order to check the null char, the algorithm keeps track of the
   matched bytes/dwords, requiring two more AVX2 instructions (VPMINU
   and VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instruction, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
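
/* Rough scalar model (illustration only, not used below): the per-byte
   effect of the VPCMPEQ/VPMINU/VPCMPEQ sequence for the strcmp case can
   be sketched in C as follows, where first_stop is a hypothetical helper
   returning the index of the first NUL or mismatch within one vector, or
   len if there is none.

     #include <stddef.h>

     static size_t
     first_stop (const unsigned char *a, const unsigned char *b, size_t len)
     {
       size_t i;
       for (i = 0; i < len; i++)
         {
           // VPCMPEQ: all-ones where the bytes are equal, zero otherwise.
           unsigned char eq = (a[i] == b[i]) ? 0xff : 0x00;
           // VPMINU: zero iff a[i] is NUL or the bytes differ.
           unsigned char keep = a[i] < eq ? a[i] : eq;
           // VPCMPEQ against the zero register, then vpmovmskb + tzcnt.
           if (keep == 0)
             return i;
         }
       return len;
     }
*/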

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for the simple cases (length 0 or 1).  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
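	/* For instance (illustration): if the original maximum offset was
	   100 bytes and the aligned base %rax ends up 48 bytes past %rdi,
	   %r11 becomes 52, so any match or mismatch found at index >= 52
	   relative to the new bases must return 0.  */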
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, unlike SSE2 strcmp where
	   ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64. */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif
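	/* The two 32-bit vpmovmskb masks produced below are combined into
	   a single 64-bit register (salq $32 + xorq) before the variable
	   shift, so the combined mask must fit in exactly 64 bits.  */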

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  The branch below
	   checks whether the strncmp maximum offset has been reached.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting with
	   the wider ymm vector if possible, and otherwise with xmm.  If
	   even a 16-byte (xmm) fetch would cross the boundary, byte
	   comparison must be done.  */
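	/* With the sizes used here, the fallback order is: 32-byte ymm
	   loads, a 16-byte xmm load, then 8-byte and 4-byte loads (byte
	   variants only), and finally the byte/dword-at-a-time loop in
	   L(cross_page_loop).  */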
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: Not
	   needed for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the low 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try a 4-byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the low 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif