1/* strcmp with SSE4.2
2   Copyright (C) 2009-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* Default name of the symbol defined by this file.  The #ifndef lets an
   earlier definition of STRCMP_SSE42 override it.  */
21#ifndef STRCMP_SSE42
22# define STRCMP_SSE42	__strcmp_sse42
23#endif
24
25#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
26# include "locale-defines.h"
27#endif
28
29#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
30/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
31   if the new counter > the old one or is 0.  */
/* UPDATE_STRNCMP_COUNTER computes %r9 = %r11 + %rcx - 16.  If that value
   is above the old %r11 (unsigned compare) or is zero it branches to
   strcmp_exitz; otherwise it becomes the new %r11.  The macro overwrites
   %r9 and the flags.  When neither USE_AS_STRNCMP nor
   USE_AS_STRNCASECMP_L is defined it expands to nothing.  */
32# define UPDATE_STRNCMP_COUNTER				\
33	/* calculate left number to compare */		\
34	lea	-16(%rcx, %r11), %r9;			\
35	cmp	%r9, %r11;				\
36	jb	LABEL(strcmp_exitz);			\
37	test	%r9, %r9;				\
38	je	LABEL(strcmp_exitz);			\
39	mov	%r9, %r11
40#else
41# define UPDATE_STRNCMP_COUNTER
42#endif
43
/* SECTION is the suffix used in the .text.SECTION directive further
   down; GLABEL(l) appends _avx or _sse42 to the symbol name l, depending
   on whether USE_AVX is defined.  */
44#ifdef USE_AVX
45# define SECTION	avx
46# define GLABEL(l)	l##_avx
47#else
48# define SECTION	sse4.2
49# define GLABEL(l)	l##_sse42
50#endif
51
/* LABEL(l) pastes .L in front of the name l.  */
52#define LABEL(l)	.L##l
53
54/* We use 0x1a:
55	_SIDD_SBYTE_OPS
56	| _SIDD_CMP_EQUAL_EACH
57	| _SIDD_NEGATIVE_POLARITY
58	| _SIDD_LEAST_SIGNIFICANT
59   on pcmpistri to find out if two 16byte data elements are the same
60   and the offset of the first different byte.  There are 4 cases:
61
62   1. Both 16byte data elements are valid and identical.
63   2. Both 16byte data elements have EOS and are identical.
64   3. Both 16byte data elements are valid and they differ at offset X.
65   4. At least one 16byte data element has EOS at offset X.  Two 16byte
66      data elements must differ at or before offset X.
67
68   Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
69
70   case		ECX	CFlag	ZFlag	SFlag
71    1		16	  0	  0	  0
72    2		16	  0	  1	  1
73    3		 X	  1	  0	  0
74    4	       0 <= X	  1	 0/1	 0/1
75
76   We exit from the loop for cases 2, 3 and 4 with jbe which branches
77   when either CFlag or ZFlag is 1.  If CFlag == 0, we return 0 for
78   case 2.  */
79
80	/* Put all SSE 4.2 functions together.  */
	/* The code is placed in the .text.SECTION section with 16-byte
	   alignment, and STRCMP_SSE42 is declared as a global function
	   symbol with hidden visibility.  */
81	.section .text.SECTION,"ax",@progbits
82	.align	16
83	.type	STRCMP_SSE42, @function
84	.globl	STRCMP_SSE42
85	.hidden	STRCMP_SSE42
86#ifdef USE_AS_STRCASECMP_L
/* __strcasecmp entry stub.  It fetches the thread-local offset of
   __libc_tsd_LOCALE, loads the pointer-sized value stored there through
   %fs into RDX_LP, and then runs on into the code that follows END.
   NOTE(review): presumably this supplies the current locale as the
   locale argument of strcasecmp_l -- confirm against its prototype.  */
87ENTRY (GLABEL(__strcasecmp))
88	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
89	mov	%fs:(%rax),%RDX_LP
90
91	// XXX 5 byte should be before the function
92	/* 5-byte NOP.  */
93	.byte	0x0f,0x1f,0x44,0x00,0x00
94END (GLABEL(__strcasecmp))
95	/* FALLTHROUGH to strcasecmp_l.  */
96#endif
97#ifdef USE_AS_STRNCASECMP_L
/* __strncasecmp entry stub.  It fetches the thread-local offset of
   __libc_tsd_LOCALE, loads the pointer-sized value stored there through
   %fs into RCX_LP, and then runs on into the code that follows END.
   NOTE(review): presumably this supplies the current locale as the
   locale argument of strncasecmp_l -- confirm against its prototype.  */
98ENTRY (GLABEL(__strncasecmp))
99	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
100	mov	%fs:(%rax),%RCX_LP
101
102	// XXX 5 byte should be before the function
103	/* 5-byte NOP.  */
104	.byte	0x0f,0x1f,0x44,0x00,0x00
105END (GLABEL(__strncasecmp))
106	/* FALLTHROUGH to strncasecmp_l.  */
107#endif
108
109
/* For AVX builds the SSE mnemonics used in this file are redefined to
   their v-prefixed AVX forms.  D(arg) then repeats its argument, so an
   instruction written as "op src, D(dst)" becomes the three-operand
   "vop src, dst, dst"; in the SSE build D(arg) is just arg.  */
110#ifdef USE_AVX
111# define movdqa vmovdqa
112# define movdqu vmovdqu
113# define pmovmskb vpmovmskb
114# define pcmpistri vpcmpistri
115# define psubb vpsubb
116# define pcmpeqb vpcmpeqb
117# define psrldq vpsrldq
118# define pslldq vpslldq
119# define palignr vpalignr
120# define pxor vpxor
121# define D(arg) arg, arg
122#else
123# define D(arg) arg
124#endif
125
126STRCMP_SSE42:
127	cfi_startproc
128	_CET_ENDBR
129	CALL_MCOUNT
130
131/*
132 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Registers as used by the code below: %rdi and %rsi address the two
 * strings.  In the length-limited variants the length arrives in RDX_LP
 * and is kept in %r11 as the count of bytes still to compare.  In the
 * case-insensitive variants the locale pointer is read from %rdx
 * (USE_AS_STRCASECMP_L) or %rcx (USE_AS_STRNCASECMP_L).
133 */
134#ifdef USE_AS_STRCASECMP_L
135	/* We have to fall back on the C implementation for locales
136	   with encodings not matching ASCII for single bytes.  */
137# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
138	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
139# else
140	mov	(%rdx), %RAX_LP
141# endif
	/* Bit 0 of the _NL_CTYPE_NONASCII_CASE value set: hand over to
	   __strcasecmp_l_nonascii.  */
142	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
143	jne	__strcasecmp_l_nonascii
144#endif
145#ifdef USE_AS_STRNCASECMP_L
146	/* We have to fall back on the C implementation for locales
147	   with encodings not matching ASCII for single bytes.  */
148# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
149	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
150# else
151	mov	(%rcx), %RAX_LP
152# endif
	/* Bit 0 of the _NL_CTYPE_NONASCII_CASE value set: hand over to
	   __strncasecmp_l_nonascii.  */
153	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
154	jne	__strncasecmp_l_nonascii
155#endif
156
157#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* A length of 0 branches to strcmp_exitz and a length of 1 to
	   Byte0; any other length is copied to %r11, the counter of bytes
	   still to compare.  */
158	test	%RDX_LP, %RDX_LP
159	je	LABEL(strcmp_exitz)
160	cmp	$1, %RDX_LP
161	je	LABEL(Byte0)
162	mov	%RDX_LP, %R11_LP
163#endif
164	mov	%esi, %ecx
165	mov	%edi, %eax
166/* Use 64bit AND here to avoid long NOP padding.  */
167	and	$0x3f, %rcx		/* rsi alignment in cache line */
168	and	$0x3f, %rax		/* rdi alignment in cache line */
169#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* 16-byte constants for TOLOWER, emitted into .rodata.cst16 and
	   loaded once into %xmm4-%xmm6.  0x40 is the code just below 'A',
	   0x5a is 'Z' and 0x5b the code just above it (the AVX and SSE
	   TOLOWER variants compare in opposite directions), and 0x20 is the
	   bit that TOLOWER ORs into upper-case bytes.  */
170	.section .rodata.cst16,"aM",@progbits,16
171	.align 16
172LABEL(belowupper):
173	.quad	0x4040404040404040
174	.quad	0x4040404040404040
175LABEL(topupper):
176# ifdef USE_AVX
177	.quad	0x5a5a5a5a5a5a5a5a
178	.quad	0x5a5a5a5a5a5a5a5a
179# else
180	.quad	0x5b5b5b5b5b5b5b5b
181	.quad	0x5b5b5b5b5b5b5b5b
182# endif
183LABEL(touppermask):
184	.quad	0x2020202020202020
185	.quad	0x2020202020202020
186	.previous
187	movdqa	LABEL(belowupper)(%rip), %xmm4
188# define UCLOW_reg %xmm4
189	movdqa	LABEL(topupper)(%rip), %xmm5
190# define UCHIGH_reg %xmm5
191	movdqa	LABEL(touppermask)(%rip), %xmm6
192# define LCQWORD_reg %xmm6
193#endif
	/* An offset above 0x30 inside the 64-byte line means an unaligned
	   16-byte load would run past the end of that line; such inputs go
	   to crosscache instead of being loaded directly here.  */
194	cmp	$0x30, %ecx
195	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
196	cmp	$0x30, %eax
197	ja	LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
	/* Neither load crosses a line: fetch the first 16 bytes of both
	   strings with unaligned loads.  */
198	movdqu	(%rdi), %xmm1
199	movdqu	(%rsi), %xmm2
200#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER(reg1, reg2): for every byte of reg1 and of reg2 that is greater
   than the UCLOW_reg byte (0x40) and not above 'Z' (signed byte
   compares), OR in the LCQWORD_reg byte (0x20).  All other bytes are left
   unchanged.  The AVX form tests "byte > 0x5a" and uses vpandn to invert
   it; the SSE form tests "0x5b > byte" directly.  Both forms overwrite
   %xmm7, %xmm8, %xmm9 and %xmm10.  In the builds without case folding the
   macro expands to nothing.  */
201# ifdef USE_AVX
202#  define TOLOWER(reg1, reg2) \
203	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
204	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
205	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
206	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
207	vpandn	%xmm7, %xmm8, %xmm8;					\
208	vpandn	%xmm9, %xmm10, %xmm10;					\
209	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
210	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
211	vpor	reg1, %xmm8, reg1;					\
212	vpor	reg2, %xmm10, reg2
213# else
214#  define TOLOWER(reg1, reg2) \
215	movdqa	reg1, %xmm7;					\
216	movdqa	UCHIGH_reg, %xmm8;				\
217	movdqa	reg2, %xmm9;					\
218	movdqa	UCHIGH_reg, %xmm10;				\
219	pcmpgtb	UCLOW_reg, %xmm7;				\
220	pcmpgtb	reg1, %xmm8;					\
221	pcmpgtb	UCLOW_reg, %xmm9;				\
222	pcmpgtb	reg2, %xmm10;					\
223	pand	%xmm8, %xmm7;					\
224	pand	%xmm10, %xmm9;					\
225	pand	LCQWORD_reg, %xmm7;				\
226	pand	LCQWORD_reg, %xmm9;				\
227	por	%xmm7, reg1;					\
228	por	%xmm9, reg2
229# endif
	/* Fold the case of the 16 bytes just loaded into %xmm1/%xmm2.  */
230	TOLOWER (%xmm1, %xmm2)
231#else
232# define TOLOWER(reg1, reg2)
233#endif
/* Compare the first 16 bytes held in %xmm1 and %xmm2.  pcmpeqb yields 0xff
   for every equal byte pair and, in %xmm0, 0xff for every NUL byte of
   %xmm1.  Subtracting the two turns an "equal but NUL" byte back into 0,
   so the byte mask in %edx is 0xffff only when all 16 bytes match and
   none of them is NUL.  */
234	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
235	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
236	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
237	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
238	pmovmskb %xmm1, %edx
239	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
240	jnz	LABEL(less16bytes)/* If not, find different value or null char */
241#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* 16 bytes consumed; branch out when the count is used up.  */
242	sub	$16, %r11
243	jbe	LABEL(strcmp_exitz)/* finish comparison */
244#endif
245	add	$16, %rsi		/* prepare to search next 16 bytes */
246	add	$16, %rdi		/* prepare to search next 16 bytes */
247
248	/*
249	 * Determine source and destination string offsets from 16-byte
250	 * alignment.  Use relative offset difference between the two to
251	 * determine which case below to use.
	 *
	 * Both pointers are rounded down to a 16-byte boundary and the low
	 * four bits of the earlier offsets are kept in %ecx (for %rsi) and
	 * %eax (for %rdi).  Equal offsets go to ashr_0.  Otherwise the
	 * operands are arranged so that %ecx holds the larger offset; when
	 * that requires exchanging the two pointers, %r8d is set to 0xffff
	 * (it is 0 otherwise).  The code then jumps through unaligned_table
	 * using the index 15 + %rax - %rcx.
252	 */
253	.p2align 4
254LABEL(crosscache):
255	and	$0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
256	and	$0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
257	mov	$0xffff, %edx		/* for equivalent offset */
258	xor	%r8d, %r8d
259	and	$0xf, %ecx		/* offset of rsi */
260	and	$0xf, %eax		/* offset of rdi */
261	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
262	cmp	%eax, %ecx
263	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
264	ja	LABEL(bigger)
265	mov	%edx, %r8d		/* r8d is offset flag for exit tail */
266	xchg	%ecx, %eax
267	xchg	%rsi, %rdi
268LABEL(bigger):
269	movdqa	(%rdi), %xmm2
270	movdqa	(%rsi), %xmm1
	/* %r9 = 15 + %rax - %rcx selects a 4-byte entry of unaligned_table.
	   The entry is sign-extended and added to the table address to form
	   the jump target.  */
271	lea	15(%rax), %r9
272	sub	%rcx, %r9
273	lea	LABEL(unaligned_table)(%rip), %r10
274	movslq	(%r10, %r9,4), %r9
275	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
276	lea	(%r10, %r9), %r10
277	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
278
279/*
280 * The following cases will be handled by ashr_0
281 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
282 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
 *
 * Both pointers have the same offset inside their 16-byte chunk.  After
 * the first partial compare, every further 16-byte block of both
 * strings is read with aligned loads and no byte shifting is needed.
283 */
284	.p2align 4
285LABEL(ashr_0):
286
287	movdqa	(%rsi), %xmm1
288	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
289#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
290	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
291#else
292	movdqa	(%rdi), %xmm2
293	TOLOWER (%xmm1, %xmm2)
294	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
295#endif
296	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
297	pmovmskb %xmm1, %r9d
298	shr	%cl, %edx		/* adjust 0xffff for offset */
299	shr	%cl, %r9d		/* adjust for 16-byte offset */
300	sub	%r9d, %edx
301	/*
302	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
303	 * the start from (16-rax) and no null char was seen.
304	 */
305	jne	LABEL(less32bytes)	/* mismatch or null char */
306	UPDATE_STRNCMP_COUNTER
307	mov	$16, %rcx
308	mov	$16, %r9
309
310	/*
311	 * Now both strings are aligned at 16-byte boundary. Loop over strings
312	 * checking 32-bytes per iteration.
313	 */
314	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
315	.p2align 4
316LABEL(ashr_0_use):
317	movdqa	(%rdi,%rdx), %xmm0
318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
319	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
320#else
321	movdqa	(%rsi,%rdx), %xmm1
322	TOLOWER (%xmm0, %xmm1)
323	pcmpistri $0x1a, %xmm1, %xmm0
324#endif
	/* lea leaves the flags alone, so the jbe below still tests the
	   CF/ZF result of pcmpistri.  */
325	lea	16(%rdx), %rdx
326	jbe	LABEL(ashr_0_exit_use)
327#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
328	sub	$16, %r11
329	jbe	LABEL(strcmp_exitz)
330#endif
331
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
332	movdqa	(%rdi,%rdx), %xmm0
333#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
334	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
335#else
336	movdqa	(%rsi,%rdx), %xmm1
337	TOLOWER (%xmm0, %xmm1)
338	pcmpistri $0x1a, %xmm1, %xmm0
339#endif
340	lea	16(%rdx), %rdx
341	jbe	LABEL(ashr_0_exit_use)
342#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
343	sub	$16, %r11
344	jbe	LABEL(strcmp_exitz)
345#endif
346	jmp	LABEL(ashr_0_use)
347
348
349	.p2align 4
350LABEL(ashr_0_exit_use):
	/* CF clear means no differing byte was found (case 2 of the table
	   at the top of the file), so leave through strcmp_exitz.  */
351	jnc	LABEL(strcmp_exitz)
352#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* %rcx is the byte index reported by pcmpistri; if the remaining
	   count does not reach past it, branch to strcmp_exitz as well.  */
353	sub	%rcx, %r11
354	jbe	LABEL(strcmp_exitz)
355#endif
	/* %rdx was already advanced by 16, hence the -16 when forming the
	   offset of the byte to fetch from both strings.  */
356	lea	-16(%rdx, %rcx), %rcx
357	movzbl	(%rdi, %rcx), %eax
358	movzbl	(%rsi, %rcx), %edx
359#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Replace both bytes by their entries in _nl_C_LC_CTYPE_tolower
	   (4-byte entries, base address biased by 128 entries).  */
360	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
361	movl	(%rcx,%rax,4), %eax
362	movl	(%rcx,%rdx,4), %edx
363#endif
	/* Return the difference of the two bytes in %eax.  */
364	sub	%edx, %eax
365	ret
366
367
368
369/*
370 * The following cases will be handled by ashr_1
371 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
372 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $1 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
373 */
374	.p2align 4
375LABEL(ashr_1):
376	pslldq	$15, D(%xmm2)		/* shift first string to align with second */
377	TOLOWER (%xmm1, %xmm2)
378	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
379	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results*/
380	pmovmskb %xmm2, %r9d
381	shr	%cl, %edx		/* adjust 0xffff for offset */
382	shr	%cl, %r9d		/* adjust for 16-byte offset */
383	sub	%r9d, %edx
384	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
385	movdqa	(%rdi), %xmm3
386	UPDATE_STRNCMP_COUNTER
387
388	mov	$16, %rcx		/* index for loads*/
389	mov	$1, %r9d		/* byte position left over from less32bytes case */
390	/*
391	 * Setup %r10 value allows us to detect crossing a page boundary.
392	 * When %r10 goes positive we have crossed a page boundary and
393	 * need to do a nibble.
394	 */
395	lea	1(%rdi), %r10
396	and	$0xfff, %r10		/* offset into 4K page */
397	sub	$0x1000, %r10		/* subtract 4K pagesize */
398	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
399
400	.p2align 4
401LABEL(loop_ashr_1_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
402	add	$16, %r10
403	jg	LABEL(nibble_ashr_1_use)
404
405LABEL(nibble_ashr_1_restart_use):
	/* %xmm0 = last 15 bytes of the previous chunk followed by the first
	   byte of the chunk just loaded.  */
406	movdqa	(%rdi, %rdx), %xmm0
407	palignr $1, -16(%rdi, %rdx), D(%xmm0)
408#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
409	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
410#else
411	movdqa	(%rsi,%rdx), %xmm1
412	TOLOWER (%xmm0, %xmm1)
413	pcmpistri $0x1a, %xmm1, %xmm0
414#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
415	jbe	LABEL(exit_use)
416#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
417	sub	$16, %r11
418	jbe	LABEL(strcmp_exitz)
419#endif
420
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
421	add	$16, %rdx
422	add	$16, %r10
423	jg	LABEL(nibble_ashr_1_use)
424
425	movdqa	(%rdi, %rdx), %xmm0
426	palignr $1, -16(%rdi, %rdx), D(%xmm0)
427#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
428	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
429#else
430	movdqa	(%rsi,%rdx), %xmm1
431	TOLOWER (%xmm0, %xmm1)
432	pcmpistri $0x1a, %xmm1, %xmm0
433#endif
434	jbe	LABEL(exit_use)
435#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
436	sub	$16, %r11
437	jbe	LABEL(strcmp_exitz)
438#endif
439	add	$16, %rdx
440	jmp	LABEL(loop_ashr_1_use)
441
442	.p2align 4
443LABEL(nibble_ashr_1_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 1 drops the byte already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
444	sub	$0x1000, %r10
445	movdqa	-16(%rdi, %rdx), %xmm0
446	psrldq	$1, D(%xmm0)
447	pcmpistri      $0x3a,%xmm0, %xmm0
448#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
449	cmp	%r11, %rcx
450	jae	LABEL(nibble_ashr_exit_use)
451#endif
	/* %ecx > 14 means none of the 15 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
452	cmp	$14, %ecx
453	ja	LABEL(nibble_ashr_1_restart_use)
454
455	jmp	LABEL(nibble_ashr_exit_use)
456
457/*
458 * The following cases will be handled by ashr_2
459 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
460 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $2 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
461 */
462	.p2align 4
463LABEL(ashr_2):
	/* Shift the %rdi chunk up by 14 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
464	pslldq	$14, D(%xmm2)
465	TOLOWER (%xmm1, %xmm2)
466	pcmpeqb	%xmm1, D(%xmm2)
467	psubb	%xmm0, D(%xmm2)
468	pmovmskb %xmm2, %r9d
469	shr	%cl, %edx
470	shr	%cl, %r9d
471	sub	%r9d, %edx
472	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
473	movdqa	(%rdi), %xmm3
474	UPDATE_STRNCMP_COUNTER
475
476	mov	$16, %rcx	/* index for loads */
477	mov	$2, %r9d	/* byte position left over from less32bytes case */
478	/*
479	 * Setup %r10 value allows us to detect crossing a page boundary.
480	 * When %r10 goes positive we have crossed a page boundary and
481	 * need to do a nibble.
482	 */
483	lea	2(%rdi), %r10
484	and	$0xfff, %r10	/* offset into 4K page */
485	sub	$0x1000, %r10	/* subtract 4K pagesize */
486	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
487
488	.p2align 4
489LABEL(loop_ashr_2_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
490	add	$16, %r10
491	jg	LABEL(nibble_ashr_2_use)
492
493LABEL(nibble_ashr_2_restart_use):
	/* %xmm0 = last 14 bytes of the previous chunk followed by the first
	   2 bytes of the chunk just loaded.  */
494	movdqa	(%rdi, %rdx), %xmm0
495	palignr $2, -16(%rdi, %rdx), D(%xmm0)
496#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
497	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
498#else
499	movdqa	(%rsi,%rdx), %xmm1
500	TOLOWER (%xmm0, %xmm1)
501	pcmpistri $0x1a, %xmm1, %xmm0
502#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
503	jbe	LABEL(exit_use)
504#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
505	sub	$16, %r11
506	jbe	LABEL(strcmp_exitz)
507#endif
508
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
509	add	$16, %rdx
510	add	$16, %r10
511	jg	LABEL(nibble_ashr_2_use)
512
513	movdqa	(%rdi, %rdx), %xmm0
514	palignr $2, -16(%rdi, %rdx), D(%xmm0)
515#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
516	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
517#else
518	movdqa	(%rsi,%rdx), %xmm1
519	TOLOWER (%xmm0, %xmm1)
520	pcmpistri $0x1a, %xmm1, %xmm0
521#endif
522	jbe	LABEL(exit_use)
523#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
524	sub	$16, %r11
525	jbe	LABEL(strcmp_exitz)
526#endif
527	add	$16, %rdx
528	jmp	LABEL(loop_ashr_2_use)
529
530	.p2align 4
531LABEL(nibble_ashr_2_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 2 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
532	sub	$0x1000, %r10
533	movdqa	-16(%rdi, %rdx), %xmm0
534	psrldq	$2, D(%xmm0)
535	pcmpistri      $0x3a,%xmm0, %xmm0
536#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
537	cmp	%r11, %rcx
538	jae	LABEL(nibble_ashr_exit_use)
539#endif
	/* %ecx > 13 means none of the 14 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
540	cmp	$13, %ecx
541	ja	LABEL(nibble_ashr_2_restart_use)
542
543	jmp	LABEL(nibble_ashr_exit_use)
544
545/*
546 * The following cases will be handled by ashr_3
547 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
548 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $3 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
549 */
550	.p2align 4
551LABEL(ashr_3):
	/* Shift the %rdi chunk up by 13 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
552	pslldq	$13, D(%xmm2)
553	TOLOWER (%xmm1, %xmm2)
554	pcmpeqb	%xmm1, D(%xmm2)
555	psubb	%xmm0, D(%xmm2)
556	pmovmskb %xmm2, %r9d
557	shr	%cl, %edx
558	shr	%cl, %r9d
559	sub	%r9d, %edx
560	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
561	movdqa	(%rdi), %xmm3
562
563	UPDATE_STRNCMP_COUNTER
564
565	mov	$16, %rcx	/* index for loads */
566	mov	$3, %r9d	/* byte position left over from less32bytes case */
567	/*
568	 * Setup %r10 value allows us to detect crossing a page boundary.
569	 * When %r10 goes positive we have crossed a page boundary and
570	 * need to do a nibble.
571	 */
572	lea	3(%rdi), %r10
573	and	$0xfff, %r10	/* offset into 4K page */
574	sub	$0x1000, %r10	/* subtract 4K pagesize */
575	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
576
577LABEL(loop_ashr_3_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
578	add	$16, %r10
579	jg	LABEL(nibble_ashr_3_use)
580
581LABEL(nibble_ashr_3_restart_use):
	/* %xmm0 = last 13 bytes of the previous chunk followed by the first
	   3 bytes of the chunk just loaded.  */
582	movdqa	(%rdi, %rdx), %xmm0
583	palignr $3, -16(%rdi, %rdx), D(%xmm0)
584#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
585	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
586#else
587	movdqa	(%rsi,%rdx), %xmm1
588	TOLOWER (%xmm0, %xmm1)
589	pcmpistri $0x1a, %xmm1, %xmm0
590#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
591	jbe	LABEL(exit_use)
592#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
593	sub	$16, %r11
594	jbe	LABEL(strcmp_exitz)
595#endif
596
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
597	add	$16, %rdx
598	add	$16, %r10
599	jg	LABEL(nibble_ashr_3_use)
600
601	movdqa	(%rdi, %rdx), %xmm0
602	palignr $3, -16(%rdi, %rdx), D(%xmm0)
603#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
604	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
605#else
606	movdqa	(%rsi,%rdx), %xmm1
607	TOLOWER (%xmm0, %xmm1)
608	pcmpistri $0x1a, %xmm1, %xmm0
609#endif
610	jbe	LABEL(exit_use)
611#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
612	sub	$16, %r11
613	jbe	LABEL(strcmp_exitz)
614#endif
615	add	$16, %rdx
616	jmp	LABEL(loop_ashr_3_use)
617
618	.p2align 4
619LABEL(nibble_ashr_3_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 3 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
620	sub	$0x1000, %r10
621	movdqa	-16(%rdi, %rdx), %xmm0
622	psrldq	$3, D(%xmm0)
623	pcmpistri      $0x3a,%xmm0, %xmm0
624#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
625	cmp	%r11, %rcx
626	jae	LABEL(nibble_ashr_exit_use)
627#endif
	/* %ecx > 12 means none of the 13 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
628	cmp	$12, %ecx
629	ja	LABEL(nibble_ashr_3_restart_use)
630
631	jmp	LABEL(nibble_ashr_exit_use)
632
633/*
634 * The following cases will be handled by ashr_4
635 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
636 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $4 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
637 */
638	.p2align 4
639LABEL(ashr_4):
	/* Shift the %rdi chunk up by 12 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
640	pslldq	$12, D(%xmm2)
641	TOLOWER (%xmm1, %xmm2)
642	pcmpeqb	%xmm1, D(%xmm2)
643	psubb	%xmm0, D(%xmm2)
644	pmovmskb %xmm2, %r9d
645	shr	%cl, %edx
646	shr	%cl, %r9d
647	sub	%r9d, %edx
648	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
649	movdqa	(%rdi), %xmm3
650
651	UPDATE_STRNCMP_COUNTER
652
653	mov	$16, %rcx	/* index for loads */
654	mov	$4, %r9d	/* byte position left over from less32bytes case */
655	/*
656	 * Setup %r10 value allows us to detect crossing a page boundary.
657	 * When %r10 goes positive we have crossed a page boundary and
658	 * need to do a nibble.
659	 */
660	lea	4(%rdi), %r10
661	and	$0xfff, %r10	/* offset into 4K page */
662	sub	$0x1000, %r10	/* subtract 4K pagesize */
663	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
664
665	.p2align 4
666LABEL(loop_ashr_4_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
667	add	$16, %r10
668	jg	LABEL(nibble_ashr_4_use)
669
670LABEL(nibble_ashr_4_restart_use):
	/* %xmm0 = last 12 bytes of the previous chunk followed by the first
	   4 bytes of the chunk just loaded.  */
671	movdqa	(%rdi, %rdx), %xmm0
672	palignr $4, -16(%rdi, %rdx), D(%xmm0)
673#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
674	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
675#else
676	movdqa	(%rsi,%rdx), %xmm1
677	TOLOWER (%xmm0, %xmm1)
678	pcmpistri $0x1a, %xmm1, %xmm0
679#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
680	jbe	LABEL(exit_use)
681#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
682	sub	$16, %r11
683	jbe	LABEL(strcmp_exitz)
684#endif
685
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
686	add	$16, %rdx
687	add	$16, %r10
688	jg	LABEL(nibble_ashr_4_use)
689
690	movdqa	(%rdi, %rdx), %xmm0
691	palignr $4, -16(%rdi, %rdx), D(%xmm0)
692#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
693	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
694#else
695	movdqa	(%rsi,%rdx), %xmm1
696	TOLOWER (%xmm0, %xmm1)
697	pcmpistri $0x1a, %xmm1, %xmm0
698#endif
699	jbe	LABEL(exit_use)
700#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
701	sub	$16, %r11
702	jbe	LABEL(strcmp_exitz)
703#endif
704	add	$16, %rdx
705	jmp	LABEL(loop_ashr_4_use)
706
707	.p2align 4
708LABEL(nibble_ashr_4_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 4 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
709	sub	$0x1000, %r10
710	movdqa	-16(%rdi, %rdx), %xmm0
711	psrldq	$4, D(%xmm0)
712	pcmpistri      $0x3a,%xmm0, %xmm0
713#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
714	cmp	%r11, %rcx
715	jae	LABEL(nibble_ashr_exit_use)
716#endif
	/* %ecx > 11 means none of the 12 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
717	cmp	$11, %ecx
718	ja	LABEL(nibble_ashr_4_restart_use)
719
720	jmp	LABEL(nibble_ashr_exit_use)
721
722/*
723 * The following cases will be handled by ashr_5
724 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
725 *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $5 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
726 */
727	.p2align 4
728LABEL(ashr_5):
	/* Shift the %rdi chunk up by 11 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
729	pslldq	$11, D(%xmm2)
730	TOLOWER (%xmm1, %xmm2)
731	pcmpeqb	%xmm1, D(%xmm2)
732	psubb	%xmm0, D(%xmm2)
733	pmovmskb %xmm2, %r9d
734	shr	%cl, %edx
735	shr	%cl, %r9d
736	sub	%r9d, %edx
737	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
738	movdqa	(%rdi), %xmm3
739
740	UPDATE_STRNCMP_COUNTER
741
742	mov	$16, %rcx	/* index for loads */
743	mov	$5, %r9d	/* byte position left over from less32bytes case */
744	/*
745	 * Setup %r10 value allows us to detect crossing a page boundary.
746	 * When %r10 goes positive we have crossed a page boundary and
747	 * need to do a nibble.
748	 */
749	lea	5(%rdi), %r10
750	and	$0xfff, %r10	/* offset into 4K page */
751	sub	$0x1000, %r10	/* subtract 4K pagesize */
752	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
753
754	.p2align 4
755LABEL(loop_ashr_5_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
756	add	$16, %r10
757	jg	LABEL(nibble_ashr_5_use)
758
759LABEL(nibble_ashr_5_restart_use):
	/* %xmm0 = last 11 bytes of the previous chunk followed by the first
	   5 bytes of the chunk just loaded.  */
760	movdqa	(%rdi, %rdx), %xmm0
761	palignr $5, -16(%rdi, %rdx), D(%xmm0)
762#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
763	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
764#else
765	movdqa	(%rsi,%rdx), %xmm1
766	TOLOWER (%xmm0, %xmm1)
767	pcmpistri $0x1a, %xmm1, %xmm0
768#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
769	jbe	LABEL(exit_use)
770#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
771	sub	$16, %r11
772	jbe	LABEL(strcmp_exitz)
773#endif
774
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
775	add	$16, %rdx
776	add	$16, %r10
777	jg	LABEL(nibble_ashr_5_use)
778
779	movdqa	(%rdi, %rdx), %xmm0
780
781	palignr $5, -16(%rdi, %rdx), D(%xmm0)
782#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
783	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
784#else
785	movdqa	(%rsi,%rdx), %xmm1
786	TOLOWER (%xmm0, %xmm1)
787	pcmpistri $0x1a, %xmm1, %xmm0
788#endif
789	jbe	LABEL(exit_use)
790#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
791	sub	$16, %r11
792	jbe	LABEL(strcmp_exitz)
793#endif
794	add	$16, %rdx
795	jmp	LABEL(loop_ashr_5_use)
796
797	.p2align 4
798LABEL(nibble_ashr_5_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 5 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
799	sub	$0x1000, %r10
800	movdqa	-16(%rdi, %rdx), %xmm0
801	psrldq	$5, D(%xmm0)
802	pcmpistri      $0x3a,%xmm0, %xmm0
803#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
804	cmp	%r11, %rcx
805	jae	LABEL(nibble_ashr_exit_use)
806#endif
	/* %ecx > 10 means none of the 11 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
807	cmp	$10, %ecx
808	ja	LABEL(nibble_ashr_5_restart_use)
809
810	jmp	LABEL(nibble_ashr_exit_use)
811
812/*
813 * The following cases will be handled by ashr_6
814 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
815 *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $6 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
816 */
817	.p2align 4
818LABEL(ashr_6):
	/* Shift the %rdi chunk up by 10 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
819	pslldq	$10, D(%xmm2)
820	TOLOWER (%xmm1, %xmm2)
821	pcmpeqb	%xmm1, D(%xmm2)
822	psubb	%xmm0, D(%xmm2)
823	pmovmskb %xmm2, %r9d
824	shr	%cl, %edx
825	shr	%cl, %r9d
826	sub	%r9d, %edx
827	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
828	movdqa	(%rdi), %xmm3
829
830	UPDATE_STRNCMP_COUNTER
831
832	mov	$16, %rcx	/* index for loads */
833	mov	$6, %r9d	/* byte position left over from less32bytes case */
834	/*
835	 * Setup %r10 value allows us to detect crossing a page boundary.
836	 * When %r10 goes positive we have crossed a page boundary and
837	 * need to do a nibble.
838	 */
839	lea	6(%rdi), %r10
840	and	$0xfff, %r10	/* offset into 4K page */
841	sub	$0x1000, %r10	/* subtract 4K pagesize */
842	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
843
844	.p2align 4
845LABEL(loop_ashr_6_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
846	add	$16, %r10
847	jg	LABEL(nibble_ashr_6_use)
848
849LABEL(nibble_ashr_6_restart_use):
	/* %xmm0 = last 10 bytes of the previous chunk followed by the first
	   6 bytes of the chunk just loaded.  */
850	movdqa	(%rdi, %rdx), %xmm0
851	palignr $6, -16(%rdi, %rdx), D(%xmm0)
852#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
853	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
854#else
855	movdqa	(%rsi,%rdx), %xmm1
856	TOLOWER (%xmm0, %xmm1)
857	pcmpistri $0x1a, %xmm1, %xmm0
858#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
859	jbe	LABEL(exit_use)
860#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
861	sub	$16, %r11
862	jbe	LABEL(strcmp_exitz)
863#endif
864
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
865	add	$16, %rdx
866	add	$16, %r10
867	jg	LABEL(nibble_ashr_6_use)
868
869	movdqa	(%rdi, %rdx), %xmm0
870	palignr $6, -16(%rdi, %rdx), D(%xmm0)
871#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
872	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
873#else
874	movdqa	(%rsi,%rdx), %xmm1
875	TOLOWER (%xmm0, %xmm1)
876	pcmpistri $0x1a, %xmm1, %xmm0
877#endif
878	jbe	LABEL(exit_use)
879#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
880	sub	$16, %r11
881	jbe	LABEL(strcmp_exitz)
882#endif
883	add	$16, %rdx
884	jmp	LABEL(loop_ashr_6_use)
885
886	.p2align 4
887LABEL(nibble_ashr_6_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 6 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
888	sub	$0x1000, %r10
889	movdqa	-16(%rdi, %rdx), %xmm0
890	psrldq	$6, D(%xmm0)
891	pcmpistri      $0x3a,%xmm0, %xmm0
892#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
893	cmp	%r11, %rcx
894	jae	LABEL(nibble_ashr_exit_use)
895#endif
	/* %ecx > 9 means none of the 10 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
896	cmp	$9, %ecx
897	ja	LABEL(nibble_ashr_6_restart_use)
898
899	jmp	LABEL(nibble_ashr_exit_use)
900
901/*
902 * The following cases will be handled by ashr_7
903 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
904 *        n(9~15)          n - 9		  6(15 +(n - 9) - n)         ashr_7
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $7 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
905 */
906	.p2align 4
907LABEL(ashr_7):
	/* Shift the %rdi chunk up by 9 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
908	pslldq	$9, D(%xmm2)
909	TOLOWER (%xmm1, %xmm2)
910	pcmpeqb	%xmm1, D(%xmm2)
911	psubb	%xmm0, D(%xmm2)
912	pmovmskb %xmm2, %r9d
913	shr	%cl, %edx
914	shr	%cl, %r9d
915	sub	%r9d, %edx
916	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
917	movdqa	(%rdi), %xmm3
918
919	UPDATE_STRNCMP_COUNTER
920
921	mov	$16, %rcx	/* index for loads */
922	mov	$7, %r9d	/* byte position left over from less32bytes case */
923	/*
924	 * Setup %r10 value allows us to detect crossing a page boundary.
925	 * When %r10 goes positive we have crossed a page boundary and
926	 * need to do a nibble.
927	 */
928	lea	7(%rdi), %r10
929	and	$0xfff, %r10	/* offset into 4K page */
930	sub	$0x1000, %r10	/* subtract 4K pagesize */
931	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
932
933	.p2align 4
934LABEL(loop_ashr_7_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
935	add	$16, %r10
936	jg	LABEL(nibble_ashr_7_use)
937
938LABEL(nibble_ashr_7_restart_use):
	/* %xmm0 = last 9 bytes of the previous chunk followed by the first
	   7 bytes of the chunk just loaded.  */
939	movdqa	(%rdi, %rdx), %xmm0
940	palignr $7, -16(%rdi, %rdx), D(%xmm0)
941#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
942	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
943#else
944	movdqa	(%rsi,%rdx), %xmm1
945	TOLOWER (%xmm0, %xmm1)
946	pcmpistri $0x1a, %xmm1, %xmm0
947#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
948	jbe	LABEL(exit_use)
949#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
950	sub	$16, %r11
951	jbe	LABEL(strcmp_exitz)
952#endif
953
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
954	add	$16, %rdx
955	add	$16, %r10
956	jg	LABEL(nibble_ashr_7_use)
957
958	movdqa	(%rdi, %rdx), %xmm0
959	palignr $7, -16(%rdi, %rdx), D(%xmm0)
960#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
961	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
962#else
963	movdqa	(%rsi,%rdx), %xmm1
964	TOLOWER (%xmm0, %xmm1)
965	pcmpistri $0x1a, %xmm1, %xmm0
966#endif
967	jbe	LABEL(exit_use)
968#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
969	sub	$16, %r11
970	jbe	LABEL(strcmp_exitz)
971#endif
972	add	$16, %rdx
973	jmp	LABEL(loop_ashr_7_use)
974
975	.p2align 4
976LABEL(nibble_ashr_7_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 7 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
977	sub	$0x1000, %r10
978	movdqa	-16(%rdi, %rdx), %xmm0
979	psrldq	$7, D(%xmm0)
980	pcmpistri      $0x3a,%xmm0, %xmm0
981#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
982	cmp	%r11, %rcx
983	jae	LABEL(nibble_ashr_exit_use)
984#endif
	/* %ecx > 8 means none of the 9 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
985	cmp	$8, %ecx
986	ja	LABEL(nibble_ashr_7_restart_use)
987
988	jmp	LABEL(nibble_ashr_exit_use)
989
990/*
991 *  The following cases will be handled by ashr_8
992 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
993 *        n(8~15)          n - 8		  7(15 +(n - 8) - n)         ashr_8
 *
 * On entry %xmm1 and %xmm2 hold the aligned 16 bytes at (%rsi) and
 * (%rdi), %xmm0 flags the NUL bytes of %xmm1, %edx is 0xffff and %cl is
 * the offset of %rsi inside its chunk.  After the first partial compare
 * the loop rebuilds each 16-byte window of the %rdi string with
 * palignr $8 from two neighbouring aligned chunks and compares it with
 * the aligned chunk at (%rsi,%rdx).
994 */
995	.p2align 4
996LABEL(ashr_8):
	/* Shift the %rdi chunk up by 8 bytes so that it lines up with the
	   %rsi chunk, compare, and drop the mask bits below offset %cl.  */
997	pslldq	$8, D(%xmm2)
998	TOLOWER (%xmm1, %xmm2)
999	pcmpeqb	%xmm1, D(%xmm2)
1000	psubb	%xmm0, D(%xmm2)
1001	pmovmskb %xmm2, %r9d
1002	shr	%cl, %edx
1003	shr	%cl, %r9d
1004	sub	%r9d, %edx
1005	jnz	LABEL(less32bytes)
	/* NOTE(review): %xmm3 is loaded here but is not read anywhere in the
	   part of the file visible to this review -- confirm whether the
	   load is still needed.  */
1006	movdqa	(%rdi), %xmm3
1007
1008	UPDATE_STRNCMP_COUNTER
1009
1010	mov	$16, %rcx	/* index for loads */
1011	mov	$8, %r9d	/* byte position left over from less32bytes case */
1012	/*
1013	 * Setup %r10 value allows us to detect crossing a page boundary.
1014	 * When %r10 goes positive we have crossed a page boundary and
1015	 * need to do a nibble.
1016	 */
1017	lea	8(%rdi), %r10
1018	and	$0xfff, %r10	/* offset into 4K page */
1019	sub	$0x1000, %r10	/* subtract 4K pagesize */
1020	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
1021
1022	.p2align 4
1023LABEL(loop_ashr_8_use):
	/* Advance the page counter; a positive value diverts to the nibble
	   check before the next chunk of the %rdi string is loaded.  */
1024	add	$16, %r10
1025	jg	LABEL(nibble_ashr_8_use)
1026
1027LABEL(nibble_ashr_8_restart_use):
	/* %xmm0 = last 8 bytes of the previous chunk followed by the first
	   8 bytes of the chunk just loaded.  */
1028	movdqa	(%rdi, %rdx), %xmm0
1029	palignr $8, -16(%rdi, %rdx), D(%xmm0)
1030#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1031	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1032#else
1033	movdqa	(%rsi,%rdx), %xmm1
1034	TOLOWER (%xmm0, %xmm1)
1035	pcmpistri $0x1a, %xmm1, %xmm0
1036#endif
	/* CF or ZF set: a difference or the end of a string lies inside
	   this window.  */
1037	jbe	LABEL(exit_use)
1038#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1039	sub	$16, %r11
1040	jbe	LABEL(strcmp_exitz)
1041#endif
1042
	/* Second copy of the same 16-byte step (loop unrolled twice).  */
1043	add	$16, %rdx
1044	add	$16, %r10
1045	jg	LABEL(nibble_ashr_8_use)
1046
1047	movdqa	(%rdi, %rdx), %xmm0
1048	palignr $8, -16(%rdi, %rdx), D(%xmm0)
1049#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1050	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1051#else
1052	movdqa	(%rsi,%rdx), %xmm1
1053	TOLOWER (%xmm0, %xmm1)
1054	pcmpistri $0x1a, %xmm1, %xmm0
1055#endif
1056	jbe	LABEL(exit_use)
1057#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1058	sub	$16, %r11
1059	jbe	LABEL(strcmp_exitz)
1060#endif
1061	add	$16, %rdx
1062	jmp	LABEL(loop_ashr_8_use)
1063
1064	.p2align 4
1065LABEL(nibble_ashr_8_use):
	/* Re-arm the page counter, then look only at the bytes of the
	   previous chunk that have not been compared yet.  Shifting right
	   by 8 drops the bytes already consumed and zero-fills the top;
	   pcmpistri $0x3a of the register against itself then leaves in
	   %ecx the index of the first NUL byte.  */
1066	sub	$0x1000, %r10
1067	movdqa	-16(%rdi, %rdx), %xmm0
1068	psrldq	$8, D(%xmm0)
1069	pcmpistri      $0x3a,%xmm0, %xmm0
1070#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The remaining count does not reach past those bytes: finish via
	   nibble_ashr_exit_use.  */
1071	cmp	%r11, %rcx
1072	jae	LABEL(nibble_ashr_exit_use)
1073#endif
	/* %ecx > 7 means none of the 8 leftover bytes is NUL, so the
	   string continues and the load of the next chunk is retried.  */
1074	cmp	$7, %ecx
1075	ja	LABEL(nibble_ashr_8_restart_use)
1076
1077	jmp	LABEL(nibble_ashr_exit_use)
1078
/*
 *  The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(7~15)          n - 7		  8(15 +(n - 7) - n)         ashr_9
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $9" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_9_use first inspects the 7 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_9):
	pslldq	$7, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_9_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)

LABEL(nibble_ashr_9_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 7 bytes of the previous block followed by the
	   lower 9 bytes of the current block.  */
	palignr $9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_9_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_9_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 7 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_9_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$9, D(%xmm0)		/* keep the 7 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$6, %ecx
	ja	LABEL(nibble_ashr_9_restart_use)	/* all 7 bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(6~15)          n - 6		  9(15 +(n - 6) - n)         ashr_10
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $10" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_10_use first inspects the 6 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_10):
	pslldq	$6, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_10_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

LABEL(nibble_ashr_10_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 6 bytes of the previous block followed by the
	   lower 10 bytes of the current block.  */
	palignr $10, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $10, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_10_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 6 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_10_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$10, D(%xmm0)		/* keep the 6 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$5, %ecx
	ja	LABEL(nibble_ashr_10_restart_use)	/* all 6 bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(5~15)          n - 5		  10(15 +(n - 5) - n)         ashr_11
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $11" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_11_use first inspects the 5 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_11):
	pslldq	$5, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_11_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

LABEL(nibble_ashr_11_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 5 bytes of the previous block followed by the
	   lower 11 bytes of the current block.  */
	palignr $11, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $11, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_11_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 5 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_11_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$11, D(%xmm0)		/* keep the 5 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$4, %ecx
	ja	LABEL(nibble_ashr_11_restart_use)	/* all 5 bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(4~15)          n - 4		  11(15 +(n - 4) - n)         ashr_12
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $12" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_12_use first inspects the 4 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_12):
	pslldq	$4, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_12_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

LABEL(nibble_ashr_12_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 4 bytes of the previous block followed by the
	   lower 12 bytes of the current block.  */
	palignr $12, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $12, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_12_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 4 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_12_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$12, D(%xmm0)		/* keep the 4 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$3, %ecx
	ja	LABEL(nibble_ashr_12_restart_use)	/* all 4 bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(3~15)          n - 3		  12(15 +(n - 3) - n)         ashr_13
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $13" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_13_use first inspects the 3 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_13):
	pslldq	$3, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_13_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

LABEL(nibble_ashr_13_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 3 bytes of the previous block followed by the
	   lower 13 bytes of the current block.  */
	palignr $13, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $13, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_13_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 3 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_13_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$13, D(%xmm0)		/* keep the 3 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$2, %ecx
	ja	LABEL(nibble_ashr_13_restart_use)	/* all 3 bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(2~15)          n - 2		  13(15 +(n - 2) - n)         ashr_14
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $14" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_14_use first inspects the 2 bytes
 *  still pending from the previous %rdi block.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_14):
	pslldq  $2, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_14_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

LABEL(nibble_ashr_14_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = upper 2 bytes of the previous block followed by the
	   lower 14 bytes of the current block.  */
	palignr $14, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $14, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_14_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the 2 bytes still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_14_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$14, D(%xmm0)		/* keep the 2 pending bytes, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending bytes */
#endif
	cmp	$1, %ecx
	ja	LABEL(nibble_ashr_14_restart_use)	/* both bytes are non-NUL: keep going */

	jmp	LABEL(nibble_ashr_exit_use)

/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(1~15)          n - 1		  14(15 +(n - 1) - n)         ashr_15
 *
 *  Flow: the first partial block is checked with pcmpeqb/pmovmskb and
 *  any hit is resolved in less32bytes.  After that the loop rebuilds
 *  each 16-byte window of the %rdi string with "palignr $15" and
 *  compares it against the aligned block at (%rsi, %rdx).  Whenever
 *  %r10 goes positive, nibble_ashr_15_use first inspects the single
 *  byte still pending from the previous %rdi block.
 *
 *  This is the last ashr_N case: its nibble code falls through into
 *  nibble_ashr_exit_use instead of jumping to it.
 *
 *  NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %r8d and %r11 are set up
 *  before this label, outside this excerpt -- confirm their contents
 *  against the function prologue.
 */
	.p2align 4
LABEL(ashr_15):
	pslldq	$1, D(%xmm2)
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, D(%xmm2)
	psubb	%xmm0, D(%xmm2)
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* Non-zero %edx: less32bytes picks the deciding byte from its
	   lowest set bit.  */
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte shift; exit_use adds %r9 - 16 to %rdi */
	/*
	 * Set up %r10 so that crossing a page boundary can be detected.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop */

	/* Main loop, unrolled twice; %rdx and %r10 advance by 16 per
	   window.  */
	.p2align 4
LABEL(loop_ashr_15_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

LABEL(nibble_ashr_15_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = top byte of the previous block followed by the lower
	   15 bytes of the current block.  */
	palignr $15, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	/* CF set: a differing byte was found.  ZF set: the block taken
	   from %rsi holds a NUL.  Either way the loop is finished.  */
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* length limit used up: equal */
#endif

	/* Second unrolled copy of the step above.  */
	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

	movdqa	(%rdi, %rdx), %xmm0
	palignr $15, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	jbe	LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_15_use)

	/*
	 * Reached when %r10 goes positive.  Inspect the single byte still
	 * pending from the previous %rdi block before loading the next
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_15_use):
	sub	$0x1000, %r10		/* re-arm the counter for the next page */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$15, D(%xmm0)		/* keep the pending byte, zero-fill the rest */
	pcmpistri      $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL in %xmm0 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* limit ends within the pending byte */
#endif
	cmp	$0, %ecx
	ja	LABEL(nibble_ashr_15_restart_use)	/* the byte is non-NUL: keep going */
	/* Otherwise fall through into nibble_ashr_exit_use.  */

/*
 *  Common exits of the ashr_N loops.
 *
 *  nibble_ashr_exit_use: entered from the nibble code with the pending
 *  bytes of the %rdi string in %xmm0 (zero-filled above them).  They
 *  are compared against the block at (%rsi, %rdx) and control falls
 *  through into exit_use.
 *
 *  exit_use: expects the flags and %ecx left by a "pcmpistri $0x1a".
 *  CF clear means no differing byte was found, so the result is 0.
 *  Otherwise %ecx is the index of the first differing byte within the
 *  block at offset %rdx, and the two bytes are loaded and subtracted.
 */
LABEL(nibble_ashr_exit_use):
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
#else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
#endif
	.p2align 4
LABEL(exit_use):
	jnc	LABEL(strcmp_exitz)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rcx, %r11
	jbe	LABEL(strcmp_exitz)	/* difference lies at or beyond the limit */
#endif
	add	%rcx, %rdx		/* offset of the differing byte */
	lea	-16(%rdi, %r9), %rdi	/* apply the %r9-byte shift of this ashr case */
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	/* %r8d is the same flag less32bytes uses to recover the original
	   operand order; when it is set the two bytes are exchanged.  */
	test	%r8d, %r8d
	jz	LABEL(ret_use)
	xchg	%eax, %edx
LABEL(ret_use):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table
	   (4-byte entries) before subtracting.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
	movl	(%rcx,%rdx,4), %edx
	movl	(%rcx,%rax,4), %eax
#endif

	sub	%edx, %eax
	ret

/*
 *  less32bytes: entered from the first-block check of an ashr_N case
 *  with a non-zero mask in %edx.  Adds the offsets in %rax and %rcx
 *  back onto %rdi and %rsi, restores the original operand order when
 *  %r8d is set, and falls into the common return code.
 *
 *  ret / less16bytes: %rdx holds a bit mask whose lowest set bit is
 *  the index of the byte that decides the result.  The mask must be
 *  non-zero: bsf leaves its destination undefined for a zero source.
 */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)	/* that byte is at or beyond the limit */
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table
	   (4-byte entries) before subtracting.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret

/* Return 0.  Reached when no differing byte was found, or, in the
   length-limited variants, when the limit in %r11 ran out first.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret

/*
 *  Byte0: return the byte at (%rdi) minus the byte at (%rsi), both
 *  zero-extended (and mapped through the tolower table in the
 *  case-insensitive variants).  This ends the function body: the CFI
 *  region is closed and the symbol size is recorded right after it.
 */
	.p2align 4
	/* XXX Same as the tail of less16bytes above, with index 0.  */
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret
	cfi_endproc
	.size	STRCMP_SSE42, .-STRCMP_SSE42

/* Undefine macros whose definitions are earlier in the file (not part
   of this excerpt).  TOLOWER is the one used by the code above.  */
#undef UCLOW_reg
#undef UCHIGH_reg
#undef LCQWORD_reg
#undef TOLOWER

/*
 *  Offset table for the ashr_N cases: sixteen 32-bit entries, each the
 *  distance from the start of the table to one ashr_N label.  Entries
 *  0..14 refer to ashr_1..ashr_15 and entry 15 refers to ashr_0.
 *  NOTE(review): the code that indexes this table is outside this
 *  excerpt.
 */
	/* Put all SSE 4.2 functions together.  */
	.section .rodata.SECTION,"a",@progbits
	.p2align 3
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)

/* Undefine the file-local macros.  LABEL, GLABEL and SECTION are
   defined at the top of this file.  NOTE(review): the instruction
   names and D below are presumably macros only in some build
   configurations (their definitions are not visible here); an #undef
   of a name that is not defined has no effect.  */
#undef LABEL
#undef GLABEL
#undef SECTION
#undef movdqa
#undef movdqu
#undef pmovmskb
#undef pcmpistri
#undef psubb
#undef pcmpeqb
#undef psrldq
#undef pslldq
#undef palignr
#undef pxor
#undef D
1827