1/* Highly optimized version for x86-64.
2   Copyright (C) 1999-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include "asm-syntax.h"
21
22#undef UPDATE_STRNCMP_COUNTER
23
24#ifndef LABEL
25#define LABEL(l) L(l)
26#endif
27
28#ifdef USE_AS_STRNCMP
29/* Update the remaining-byte counter: %r9 = %r11 + %rcx - 16, i.e. %r11 minus the (16 - %rcx) bytes already compared (at the use sites %rcx was masked with 0xf).
30   Since the counter, %r11, is unsigned, we branch to strcmp_exitz if the new counter > the old one (the subtraction wrapped) or is 0; otherwise %r11 = %r9.  */
31# define UPDATE_STRNCMP_COUNTER				\
32	/* compute the number of bytes left to compare */	\
33	lea	-16(%rcx, %r11), %r9;	/* %r9 = %r11 + %rcx - 16 */	\
34	cmp	%r9, %r11;				\
35	jb	LABEL(strcmp_exitz);	/* old < new (unsigned): wrapped */	\
36	test	%r9, %r9;				\
37	je	LABEL(strcmp_exitz);	/* nothing left to compare */	\
38	mov	%r9, %r11	/* commit the new counter */
39
40#elif defined USE_AS_STRCASECMP_L
41# include "locale-defines.h"
42
43# define UPDATE_STRNCMP_COUNTER	/* expands to nothing in this configuration */
44#elif defined USE_AS_STRNCASECMP_L
45# include "locale-defines.h"
46
47# define UPDATE_STRNCMP_COUNTER				\
48	/* compute the number of bytes left to compare: %r9 = %r11 + %rcx - 16; exit if it wrapped or is 0 */	\
49	lea	-16(%rcx, %r11), %r9;			\
50	cmp	%r9, %r11;				\
51	jb	LABEL(strcmp_exitz);			\
52	test	%r9, %r9;				\
53	je	LABEL(strcmp_exitz);			\
54	mov	%r9, %r11
55#else
56# define UPDATE_STRNCMP_COUNTER	/* expands to nothing in this configuration */
57# ifndef STRCMP	/* default entry-point name when none was defined beforehand */
58#  define STRCMP strcmp
59# endif
60#endif
61
62#ifndef USE_SSSE3
63	.text
64#else
65	.section .text.ssse3,"ax",@progbits
66#endif
67
68#ifdef USE_AS_STRCASECMP_L
69# ifndef ENTRY2	/* ENTRY2/END2 default to ENTRY/END unless defined beforehand */
70#  define ENTRY2(name) ENTRY (name)
71#  define END2(name) END (name)
72# endif
73
74ENTRY2 (__strcasecmp)
75	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax	/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */
76	mov	%fs:(%rax),%RDX_LP	/* load the locale pointer into %rdx, where STRCMP below reads it */
77
78	// XXX 5 byte should be before the function
79	/* 5-byte NOP (bytes encode nopl 0x0(%rax,%rax,1)).  */
80	.byte	0x0f,0x1f,0x44,0x00,0x00
81END2 (__strcasecmp)
82# ifndef NO_NOLOCALE_ALIAS
83weak_alias (__strcasecmp, strcasecmp)
84libc_hidden_def (__strcasecmp)
85# endif
86	/* No ret above: FALLTHROUGH to strcasecmp_l, i.e. ENTRY (STRCMP) below.  */
87#elif defined USE_AS_STRNCASECMP_L
88# ifndef ENTRY2	/* ENTRY2/END2 default to ENTRY/END unless defined beforehand */
89#  define ENTRY2(name) ENTRY (name)
90#  define END2(name) END (name)
91# endif
92
93ENTRY2 (__strncasecmp)
94	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax	/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */
95	mov	%fs:(%rax),%RCX_LP	/* load the locale pointer into %rcx, where STRCMP below reads it */
96
97	// XXX 5 byte should be before the function
98	/* 5-byte NOP (bytes encode nopl 0x0(%rax,%rax,1)).  */
99	.byte	0x0f,0x1f,0x44,0x00,0x00
100END2 (__strncasecmp)
101# ifndef NO_NOLOCALE_ALIAS
102weak_alias (__strncasecmp, strncasecmp)
103libc_hidden_def (__strncasecmp)
104# endif
105	/* No ret above: FALLTHROUGH to strncasecmp_l, i.e. ENTRY (STRCMP) below.  */
106#endif
107
108ENTRY (STRCMP)
109#ifdef USE_AS_STRCASECMP_L
110	/* We have to fall back on the C implementation for locales
111	   with encodings not matching ASCII for single bytes.  */
112# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
113	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
114# else
115	mov	(%rdx), %RAX_LP
116# endif
117	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
118	jne	__strcasecmp_l_nonascii
119#elif defined USE_AS_STRNCASECMP_L
120	/* We have to fall back on the C implementation for locales
121	   with encodings not matching ASCII for single bytes.  */
122# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
123	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
124# else
125	mov	(%rcx), %RAX_LP
126# endif
127	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
128	jne	__strncasecmp_l_nonascii
129#endif
130
131/*
132 * This implementation uses SSE to compare up to 16 bytes at a time.
133 */
134#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
135	test	%RDX_LP, %RDX_LP
136	je	LABEL(strcmp_exitz)
137	cmp	$1, %RDX_LP
138	je	LABEL(Byte0)
139	mov	%RDX_LP, %R11_LP
140#endif
141	mov	%esi, %ecx
142	mov	%edi, %eax
143/* Use 64bit AND here to avoid long NOP padding.  */
144	and	$0x3f, %rcx		/* rsi alignment in cache line */
145	and	$0x3f, %rax		/* rdi alignment in cache line */
146#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
147	.section .rodata.cst16,"aM",@progbits,16
148	.align 16
149.Lbelowupper:
150	.quad	0x4040404040404040
151	.quad	0x4040404040404040
152.Ltopupper:
153	.quad	0x5b5b5b5b5b5b5b5b
154	.quad	0x5b5b5b5b5b5b5b5b
155.Ltouppermask:
156	.quad	0x2020202020202020
157	.quad	0x2020202020202020
158	.previous
159	movdqa	.Lbelowupper(%rip), %xmm5
160# define UCLOW_reg %xmm5
161	movdqa	.Ltopupper(%rip), %xmm6
162# define UCHIGH_reg %xmm6
163	movdqa	.Ltouppermask(%rip), %xmm7
164# define LCQWORD_reg %xmm7
165#endif
166	cmp	$0x30, %ecx
167	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
168	cmp	$0x30, %eax
169	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
170	movlpd	(%rdi), %xmm1
171	movlpd	(%rsi), %xmm2
172	movhpd	8(%rdi), %xmm1
173	movhpd	8(%rsi), %xmm2
174#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
175# define TOLOWER(reg1, reg2) \
176	movdqa	reg1, %xmm8;					\
177	movdqa	UCHIGH_reg, %xmm9;				\
178	movdqa	reg2, %xmm10;					\
179	movdqa	UCHIGH_reg, %xmm11;				\
180	pcmpgtb	UCLOW_reg, %xmm8;				\
181	pcmpgtb	reg1, %xmm9;					\
182	pcmpgtb	UCLOW_reg, %xmm10;				\
183	pcmpgtb	reg2, %xmm11;					\
184	pand	%xmm9, %xmm8;					\
185	pand	%xmm11, %xmm10;					\
186	pand	LCQWORD_reg, %xmm8;				\
187	pand	LCQWORD_reg, %xmm10;				\
188	por	%xmm8, reg1;					\
189	por	%xmm10, reg2
190	TOLOWER (%xmm1, %xmm2)
191#else
192# define TOLOWER(reg1, reg2)
193#endif
194	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
195	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
196	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
197	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
198	pmovmskb %xmm1, %edx
199	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
200	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
201#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
202	sub	$16, %r11
203	jbe	LABEL(strcmp_exitz)	/* finish comparision */
204#endif
205	add	$16, %rsi		/* prepare to search next 16 bytes */
206	add	$16, %rdi		/* prepare to search next 16 bytes */
207
208	/*
209	 * Determine source and destination string offsets from 16-byte alignment.
210	 * Use relative offset difference between the two to determine which case
211	 * below to use.
212	 */
213	.p2align 4
214LABEL(crosscache):
215	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
216	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
217	mov	$0xffff, %edx			/* for equivalent offset */
218	xor	%r8d, %r8d
219	and	$0xf, %ecx			/* offset of rsi */
220	and	$0xf, %eax			/* offset of rdi */
221	cmp	%eax, %ecx
222	je	LABEL(ashr_0)			/* rsi and rdi relative offset same */
223	ja	LABEL(bigger)
224	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
225	xchg	%ecx, %eax
226	xchg	%rsi, %rdi
227LABEL(bigger):
228	lea	15(%rax), %r9
229	sub	%rcx, %r9
230	lea	LABEL(unaligned_table)(%rip), %r10
231	movslq	(%r10, %r9,4), %r9
232	lea	(%r10, %r9), %r10
233	_CET_NOTRACK jmp *%r10			/* jump to corresponding case */
234
235/*
236 * The following cases will be handled by ashr_0
237 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
238 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
239 */
240	.p2align 4
241LABEL(ashr_0):
242
243	movdqa	(%rsi), %xmm1
244	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
245	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
246#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
247	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
248#else
249	movdqa	(%rdi), %xmm2
250	TOLOWER (%xmm1, %xmm2)
251	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
252#endif
253	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
254	pmovmskb %xmm1, %r9d
255	shr	%cl, %edx			/* adjust 0xffff for offset */
256	shr	%cl, %r9d			/* adjust for 16-byte offset */
257	sub	%r9d, %edx
258	/*
259	 * edx (0xffff >> cl) equals r9d, so the subtraction yields 0, only if
260	 * the remaining (16 - rcx) bytes are all equal and no null char was seen.
261	 */
262	jne	LABEL(less32bytes)		/* mismatch or null char */
263	UPDATE_STRNCMP_COUNTER
264	mov	$16, %rcx
265	mov	$16, %r9
266	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
267
268	/*
269	 * Now both strings are aligned at 16-byte boundary. Loop over strings
270	 * checking 32-bytes per iteration.
271	 */
272	.p2align 4
273LABEL(loop_ashr_0):
274	movdqa	(%rsi, %rcx), %xmm1
275	movdqa	(%rdi, %rcx), %xmm2
276	TOLOWER (%xmm1, %xmm2)
277
278	pcmpeqb	%xmm1, %xmm0
279	pcmpeqb	%xmm2, %xmm1
280	psubb	%xmm0, %xmm1
281	pmovmskb %xmm1, %edx
282	sub	$0xffff, %edx
283	jnz	LABEL(exit)		/* mismatch or null char seen */
284
285#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
286	sub	$16, %r11
287	jbe	LABEL(strcmp_exitz)
288#endif
289	add	$16, %rcx
290	movdqa	(%rsi, %rcx), %xmm1
291	movdqa	(%rdi, %rcx), %xmm2
292	TOLOWER (%xmm1, %xmm2)
293
294	pcmpeqb	%xmm1, %xmm0
295	pcmpeqb	%xmm2, %xmm1
296	psubb	%xmm0, %xmm1
297	pmovmskb %xmm1, %edx
298	sub	$0xffff, %edx
299	jnz	LABEL(exit)
300#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
301	sub	$16, %r11
302	jbe	LABEL(strcmp_exitz)
303#endif
304	add	$16, %rcx
305	jmp	LABEL(loop_ashr_0)
306
307/*
308 * The following cases will be handled by ashr_1
309 * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
310 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
311 */
312	.p2align 4
313LABEL(ashr_1):
314	pxor	%xmm0, %xmm0
315	movdqa	(%rdi), %xmm2
316	movdqa	(%rsi), %xmm1
317	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
318	pslldq	$15, %xmm2		/* shift first string to align with second */
319	TOLOWER (%xmm1, %xmm2)
320	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
321	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
322	pmovmskb %xmm2, %r9d
323	shr	%cl, %edx		/* adjust 0xffff for offset */
324	shr	%cl, %r9d		/* adjust for 16-byte offset */
325	sub	%r9d, %edx
326	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
327	movdqa	(%rdi), %xmm3
328	UPDATE_STRNCMP_COUNTER
329
330	pxor	%xmm0, %xmm0
331	mov	$16, %rcx		/* index for loads*/
332	mov	$1, %r9d		/* byte position left over from less32bytes case */
333	/*
334	 * Setup %r10 value allows us to detect crossing a page boundary.
335	 * When %r10 goes positive we have crossed a page boundary and
336	 * need to do a nibble.
337	 */
338	lea	1(%rdi), %r10
339	and	$0xfff, %r10		/* offset into 4K page */
340	sub	$0x1000, %r10		/* subtract 4K pagesize */
341
342	.p2align 4
343LABEL(loop_ashr_1):
344	add	$16, %r10
345	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
346
347LABEL(gobble_ashr_1):
348	movdqa	(%rsi, %rcx), %xmm1
349	movdqa	(%rdi, %rcx), %xmm2
350	movdqa	%xmm2, %xmm4		 /* store for next cycle */
351
352#ifndef USE_SSSE3
353	psrldq	$1, %xmm3
354	pslldq	$15, %xmm2
355	por	%xmm3, %xmm2		/* merge into one 16byte value */
356#else
357	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
358#endif
359	TOLOWER (%xmm1, %xmm2)
360
361	pcmpeqb	%xmm1, %xmm0
362	pcmpeqb	%xmm2, %xmm1
363	psubb	%xmm0, %xmm1
364	pmovmskb %xmm1, %edx
365	sub	$0xffff, %edx
366	jnz	LABEL(exit)
367
368#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
369	sub	$16, %r11
370	jbe	LABEL(strcmp_exitz)
371#endif
372	add	$16, %rcx
373	movdqa	%xmm4, %xmm3
374
375	add	$16, %r10
376	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
377
378	movdqa	(%rsi, %rcx), %xmm1
379	movdqa	(%rdi, %rcx), %xmm2
380	movdqa	%xmm2, %xmm4		/* store for next cycle */
381
382#ifndef USE_SSSE3
383	psrldq	$1, %xmm3
384	pslldq	$15, %xmm2
385	por	%xmm3, %xmm2		/* merge into one 16byte value */
386#else
387	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
388#endif
389	TOLOWER (%xmm1, %xmm2)
390
391	pcmpeqb	%xmm1, %xmm0
392	pcmpeqb	%xmm2, %xmm1
393	psubb	%xmm0, %xmm1
394	pmovmskb %xmm1, %edx
395	sub	$0xffff, %edx
396	jnz	LABEL(exit)
397
398#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
399	sub	$16, %r11
400	jbe	LABEL(strcmp_exitz)
401#endif
402	add	$16, %rcx
403	movdqa	%xmm4, %xmm3
404	jmp	LABEL(loop_ashr_1)
405
406	/*
407	 * Nibble avoids loads across page boundary. This is to avoid a potential
408	 * access into unmapped memory.
409	 */
410	.p2align 4
411LABEL(nibble_ashr_1):
412	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char*/
413	pmovmskb %xmm0, %edx
414	test	$0xfffe, %edx
415	jnz	LABEL(ashr_1_exittail)	/* find null char*/
416
417#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
418	cmp	$15, %r11
419	jbe	LABEL(ashr_1_exittail)
420#endif
421
422	pxor	%xmm0, %xmm0
423	sub	$0x1000, %r10		/* substract 4K from %r10 */
424	jmp	LABEL(gobble_ashr_1)
425
426	/*
427	 * Once a null char is found, determine whether there is a string
428	 * mismatch before the null char.
429	 */
430	.p2align 4
431LABEL(ashr_1_exittail):
432	movdqa	(%rsi, %rcx), %xmm1
433	psrldq	$1, %xmm0
434	psrldq	$1, %xmm3
435	jmp	LABEL(aftertail)
436
437/*
438 * The following cases will be handled by ashr_2
439 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   corresponding case
440 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
441 */
442	.p2align 4
443LABEL(ashr_2):
444	pxor	%xmm0, %xmm0
445	movdqa	(%rdi), %xmm2
446	movdqa	(%rsi), %xmm1
447	pcmpeqb	%xmm1, %xmm0
448	pslldq	$14, %xmm2
449	TOLOWER (%xmm1, %xmm2)
450	pcmpeqb	%xmm1, %xmm2
451	psubb	%xmm0, %xmm2
452	pmovmskb %xmm2, %r9d
453	shr	%cl, %edx
454	shr	%cl, %r9d
455	sub	%r9d, %edx
456	jnz	LABEL(less32bytes)
457	movdqa	(%rdi), %xmm3
458	UPDATE_STRNCMP_COUNTER
459
460	pxor	%xmm0, %xmm0
461	mov	$16, %rcx	/* index for loads */
462	mov	$2, %r9d	/* byte position left over from less32bytes case */
463	/*
464	 * Setup %r10 value allows us to detect crossing a page boundary.
465	 * When %r10 goes positive we have crossed a page boundary and
466	 * need to do a nibble.
467	 */
468	lea	2(%rdi), %r10
469	and	$0xfff, %r10	/* offset into 4K page */
470	sub	$0x1000, %r10	/* subtract 4K pagesize */
471
472	.p2align 4
473LABEL(loop_ashr_2):
474	add	$16, %r10
475	jg	LABEL(nibble_ashr_2)
476
477LABEL(gobble_ashr_2):
478	movdqa	(%rsi, %rcx), %xmm1
479	movdqa	(%rdi, %rcx), %xmm2
480	movdqa	%xmm2, %xmm4
481
482#ifndef USE_SSSE3
483	psrldq	$2, %xmm3
484	pslldq	$14, %xmm2
485	por	%xmm3, %xmm2		/* merge into one 16byte value */
486#else
487	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
488#endif
489	TOLOWER (%xmm1, %xmm2)
490
491	pcmpeqb	%xmm1, %xmm0
492	pcmpeqb	%xmm2, %xmm1
493	psubb	%xmm0, %xmm1
494	pmovmskb %xmm1, %edx
495	sub	$0xffff, %edx
496	jnz	LABEL(exit)
497
498#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
499	sub	$16, %r11
500	jbe	LABEL(strcmp_exitz)
501#endif
502
503	add	$16, %rcx
504	movdqa	%xmm4, %xmm3
505
506	add	$16, %r10
507	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
508
509	movdqa	(%rsi, %rcx), %xmm1
510	movdqa	(%rdi, %rcx), %xmm2
511	movdqa	%xmm2, %xmm4
512
513#ifndef USE_SSSE3
514	psrldq	$2, %xmm3
515	pslldq	$14, %xmm2
516	por	%xmm3, %xmm2		/* merge into one 16byte value */
517#else
518	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
519#endif
520	TOLOWER (%xmm1, %xmm2)
521
522	pcmpeqb	%xmm1, %xmm0
523	pcmpeqb	%xmm2, %xmm1
524	psubb	%xmm0, %xmm1
525	pmovmskb %xmm1, %edx
526	sub	$0xffff, %edx
527	jnz	LABEL(exit)
528
529#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
530	sub	$16, %r11
531	jbe	LABEL(strcmp_exitz)
532#endif
533
534	add	$16, %rcx
535	movdqa	%xmm4, %xmm3
536	jmp	LABEL(loop_ashr_2)
537
538	.p2align 4
539LABEL(nibble_ashr_2):
540	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
541	pmovmskb %xmm0, %edx
542	test	$0xfffc, %edx
543	jnz	LABEL(ashr_2_exittail)
544
545#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
546	cmp	$14, %r11
547	jbe	LABEL(ashr_2_exittail)
548#endif
549
550	pxor	%xmm0, %xmm0
551	sub	$0x1000, %r10
552	jmp	LABEL(gobble_ashr_2)
553
554	.p2align 4
555LABEL(ashr_2_exittail):
556	movdqa	(%rsi, %rcx), %xmm1
557	psrldq	$2, %xmm0
558	psrldq	$2, %xmm3
559	jmp	LABEL(aftertail)
560
561/*
562 * The following cases will be handled by ashr_3
563 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
564 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
565 */
566	.p2align 4
567LABEL(ashr_3):
568	pxor	%xmm0, %xmm0
569	movdqa	(%rdi), %xmm2
570	movdqa	(%rsi), %xmm1
571	pcmpeqb	%xmm1, %xmm0
572	pslldq	$13, %xmm2
573	TOLOWER (%xmm1, %xmm2)
574	pcmpeqb	%xmm1, %xmm2
575	psubb	%xmm0, %xmm2
576	pmovmskb %xmm2, %r9d
577	shr	%cl, %edx
578	shr	%cl, %r9d
579	sub	%r9d, %edx
580	jnz	LABEL(less32bytes)
581	movdqa	(%rdi), %xmm3
582
583	UPDATE_STRNCMP_COUNTER
584
585	pxor	%xmm0, %xmm0
586	mov	$16, %rcx	/* index for loads */
587	mov	$3, %r9d	/* byte position left over from less32bytes case */
588	/*
589	 * Setup %r10 value allows us to detect crossing a page boundary.
590	 * When %r10 goes positive we have crossed a page boundary and
591	 * need to do a nibble.
592	 */
593	lea	3(%rdi), %r10
594	and	$0xfff, %r10	/* offset into 4K page */
595	sub	$0x1000, %r10	/* subtract 4K pagesize */
596
597	.p2align 4
598LABEL(loop_ashr_3):
599	add	$16, %r10
600	jg	LABEL(nibble_ashr_3)
601
602LABEL(gobble_ashr_3):
603	movdqa	(%rsi, %rcx), %xmm1
604	movdqa	(%rdi, %rcx), %xmm2
605	movdqa	%xmm2, %xmm4
606
607#ifndef USE_SSSE3
608	psrldq	$3, %xmm3
609	pslldq	$13, %xmm2
610	por	%xmm3, %xmm2		/* merge into one 16byte value */
611#else
612	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
613#endif
614	TOLOWER (%xmm1, %xmm2)
615
616	pcmpeqb	%xmm1, %xmm0
617	pcmpeqb	%xmm2, %xmm1
618	psubb	%xmm0, %xmm1
619	pmovmskb %xmm1, %edx
620	sub	$0xffff, %edx
621	jnz	LABEL(exit)
622
623#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
624	sub	$16, %r11
625	jbe	LABEL(strcmp_exitz)
626#endif
627
628	add	$16, %rcx
629	movdqa	%xmm4, %xmm3
630
631	add	$16, %r10
632	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
633
634	movdqa	(%rsi, %rcx), %xmm1
635	movdqa	(%rdi, %rcx), %xmm2
636	movdqa	%xmm2, %xmm4
637
638#ifndef USE_SSSE3
639	psrldq	$3, %xmm3
640	pslldq	$13, %xmm2
641	por	%xmm3, %xmm2		/* merge into one 16byte value */
642#else
643	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
644#endif
645	TOLOWER (%xmm1, %xmm2)
646
647	pcmpeqb	%xmm1, %xmm0
648	pcmpeqb	%xmm2, %xmm1
649	psubb	%xmm0, %xmm1
650	pmovmskb %xmm1, %edx
651	sub	$0xffff, %edx
652	jnz	LABEL(exit)
653
654#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
655	sub	$16, %r11
656	jbe	LABEL(strcmp_exitz)
657#endif
658
659	add	$16, %rcx
660	movdqa	%xmm4, %xmm3
661	jmp	LABEL(loop_ashr_3)
662
663	.p2align 4
664LABEL(nibble_ashr_3):
665	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
666	pmovmskb %xmm0, %edx
667	test	$0xfff8, %edx
668	jnz	LABEL(ashr_3_exittail)
669
670#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
671	cmp	$13, %r11
672	jbe	LABEL(ashr_3_exittail)
673#endif
674
675	pxor	%xmm0, %xmm0
676	sub	$0x1000, %r10
677	jmp	LABEL(gobble_ashr_3)
678
679	.p2align 4
680LABEL(ashr_3_exittail):
681	movdqa	(%rsi, %rcx), %xmm1
682	psrldq	$3, %xmm0
683	psrldq	$3, %xmm3
684	jmp	LABEL(aftertail)
685
686/*
687 * The following cases will be handled by ashr_4
688 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
689 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
690 */
691	.p2align 4
692LABEL(ashr_4):
693	pxor	%xmm0, %xmm0
694	movdqa	(%rdi), %xmm2
695	movdqa	(%rsi), %xmm1
696	pcmpeqb	%xmm1, %xmm0
697	pslldq	$12, %xmm2
698	TOLOWER (%xmm1, %xmm2)
699	pcmpeqb	%xmm1, %xmm2
700	psubb	%xmm0, %xmm2
701	pmovmskb %xmm2, %r9d
702	shr	%cl, %edx
703	shr	%cl, %r9d
704	sub	%r9d, %edx
705	jnz	LABEL(less32bytes)
706	movdqa	(%rdi), %xmm3
707
708	UPDATE_STRNCMP_COUNTER
709
710	pxor	%xmm0, %xmm0
711	mov	$16, %rcx	/* index for loads */
712	mov	$4, %r9d	/* byte position left over from less32bytes case */
713	/*
714	 * Setup %r10 value allows us to detect crossing a page boundary.
715	 * When %r10 goes positive we have crossed a page boundary and
716	 * need to do a nibble.
717	 */
718	lea	4(%rdi), %r10
719	and	$0xfff, %r10	/* offset into 4K page */
720	sub	$0x1000, %r10	/* subtract 4K pagesize */
721
722	.p2align 4
723LABEL(loop_ashr_4):
724	add	$16, %r10
725	jg	LABEL(nibble_ashr_4)
726
727LABEL(gobble_ashr_4):
728	movdqa	(%rsi, %rcx), %xmm1
729	movdqa	(%rdi, %rcx), %xmm2
730	movdqa	%xmm2, %xmm4
731
732#ifndef USE_SSSE3
733	psrldq	$4, %xmm3
734	pslldq	$12, %xmm2
735	por	%xmm3, %xmm2		/* merge into one 16byte value */
736#else
737	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
738#endif
739	TOLOWER (%xmm1, %xmm2)
740
741	pcmpeqb	%xmm1, %xmm0
742	pcmpeqb	%xmm2, %xmm1
743	psubb	%xmm0, %xmm1
744	pmovmskb %xmm1, %edx
745	sub	$0xffff, %edx
746	jnz	LABEL(exit)
747
748#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
749	sub	$16, %r11
750	jbe	LABEL(strcmp_exitz)
751#endif
752
753	add	$16, %rcx
754	movdqa	%xmm4, %xmm3
755
756	add	$16, %r10
757	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
758
759	movdqa	(%rsi, %rcx), %xmm1
760	movdqa	(%rdi, %rcx), %xmm2
761	movdqa	%xmm2, %xmm4
762
763#ifndef USE_SSSE3
764	psrldq	$4, %xmm3
765	pslldq	$12, %xmm2
766	por	%xmm3, %xmm2		/* merge into one 16byte value */
767#else
768	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
769#endif
770	TOLOWER (%xmm1, %xmm2)
771
772	pcmpeqb	%xmm1, %xmm0
773	pcmpeqb	%xmm2, %xmm1
774	psubb	%xmm0, %xmm1
775	pmovmskb %xmm1, %edx
776	sub	$0xffff, %edx
777	jnz	LABEL(exit)
778
779#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
780	sub	$16, %r11
781	jbe	LABEL(strcmp_exitz)
782#endif
783
784	add	$16, %rcx
785	movdqa	%xmm4, %xmm3
786	jmp	LABEL(loop_ashr_4)
787
788	.p2align 4
789LABEL(nibble_ashr_4):
790	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
791	pmovmskb %xmm0, %edx
792	test	$0xfff0, %edx
793	jnz	LABEL(ashr_4_exittail)
794
795#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
796	cmp	$12, %r11
797	jbe	LABEL(ashr_4_exittail)
798#endif
799
800	pxor	%xmm0, %xmm0
801	sub	$0x1000, %r10
802	jmp	LABEL(gobble_ashr_4)
803
804	.p2align 4
805LABEL(ashr_4_exittail):
806	movdqa	(%rsi, %rcx), %xmm1
807	psrldq	$4, %xmm0
808	psrldq	$4, %xmm3
809	jmp	LABEL(aftertail)
810
811/*
812 * The following cases will be handled by ashr_5
813 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
814 *        n(11~15)          n - 11		  4(15 +(n-11) - n)         ashr_5
815 */
816	.p2align 4
817LABEL(ashr_5):
818	pxor	%xmm0, %xmm0
819	movdqa	(%rdi), %xmm2
820	movdqa	(%rsi), %xmm1
821	pcmpeqb	%xmm1, %xmm0
822	pslldq	$11, %xmm2
823	TOLOWER (%xmm1, %xmm2)
824	pcmpeqb	%xmm1, %xmm2
825	psubb	%xmm0, %xmm2
826	pmovmskb %xmm2, %r9d
827	shr	%cl, %edx
828	shr	%cl, %r9d
829	sub	%r9d, %edx
830	jnz	LABEL(less32bytes)
831	movdqa	(%rdi), %xmm3
832
833	UPDATE_STRNCMP_COUNTER
834
835	pxor	%xmm0, %xmm0
836	mov	$16, %rcx	/* index for loads */
837	mov	$5, %r9d	/* byte position left over from less32bytes case */
838	/*
839	 * Setup %r10 value allows us to detect crossing a page boundary.
840	 * When %r10 goes positive we have crossed a page boundary and
841	 * need to do a nibble.
842	 */
843	lea	5(%rdi), %r10
844	and	$0xfff, %r10	/* offset into 4K page */
845	sub	$0x1000, %r10	/* subtract 4K pagesize */
846
847	.p2align 4
848LABEL(loop_ashr_5):
849	add	$16, %r10
850	jg	LABEL(nibble_ashr_5)
851
852LABEL(gobble_ashr_5):
853	movdqa	(%rsi, %rcx), %xmm1
854	movdqa	(%rdi, %rcx), %xmm2
855	movdqa	%xmm2, %xmm4
856
857#ifndef USE_SSSE3
858	psrldq	$5, %xmm3
859	pslldq	$11, %xmm2
860	por	%xmm3, %xmm2		/* merge into one 16byte value */
861#else
862	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
863#endif
864	TOLOWER (%xmm1, %xmm2)
865
866	pcmpeqb	%xmm1, %xmm0
867	pcmpeqb	%xmm2, %xmm1
868	psubb	%xmm0, %xmm1
869	pmovmskb %xmm1, %edx
870	sub	$0xffff, %edx
871	jnz	LABEL(exit)
872
873#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
874	sub	$16, %r11
875	jbe	LABEL(strcmp_exitz)
876#endif
877
878	add	$16, %rcx
879	movdqa	%xmm4, %xmm3
880
881	add	$16, %r10
882	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
883
884	movdqa	(%rsi, %rcx), %xmm1
885	movdqa	(%rdi, %rcx), %xmm2
886	movdqa	%xmm2, %xmm4
887
888#ifndef USE_SSSE3
889	psrldq	$5, %xmm3
890	pslldq	$11, %xmm2
891	por	%xmm3, %xmm2		/* merge into one 16byte value */
892#else
893	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
894#endif
895	TOLOWER (%xmm1, %xmm2)
896
897	pcmpeqb	%xmm1, %xmm0
898	pcmpeqb	%xmm2, %xmm1
899	psubb	%xmm0, %xmm1
900	pmovmskb %xmm1, %edx
901	sub	$0xffff, %edx
902	jnz	LABEL(exit)
903
904#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
905	sub	$16, %r11
906	jbe	LABEL(strcmp_exitz)
907#endif
908
909	add	$16, %rcx
910	movdqa	%xmm4, %xmm3
911	jmp	LABEL(loop_ashr_5)
912
913	.p2align 4
914LABEL(nibble_ashr_5):
915	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
916	pmovmskb %xmm0, %edx
917	test	$0xffe0, %edx
918	jnz	LABEL(ashr_5_exittail)
919
920#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
921	cmp	$11, %r11
922	jbe	LABEL(ashr_5_exittail)
923#endif
924
925	pxor	%xmm0, %xmm0
926	sub	$0x1000, %r10
927	jmp	LABEL(gobble_ashr_5)
928
929	.p2align 4
930LABEL(ashr_5_exittail):
931	movdqa	(%rsi, %rcx), %xmm1
932	psrldq	$5, %xmm0
933	psrldq	$5, %xmm3
934	jmp	LABEL(aftertail)
935
936/*
937 * The following cases will be handled by ashr_6
938 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
939 *        n(10~15)          n - 10		  5(15 +(n-10) - n)         ashr_6
940 */
941	.p2align 4
942LABEL(ashr_6):
943	pxor	%xmm0, %xmm0
944	movdqa	(%rdi), %xmm2
945	movdqa	(%rsi), %xmm1
946	pcmpeqb	%xmm1, %xmm0
947	pslldq	$10, %xmm2
948	TOLOWER (%xmm1, %xmm2)
949	pcmpeqb	%xmm1, %xmm2
950	psubb	%xmm0, %xmm2
951	pmovmskb %xmm2, %r9d
952	shr	%cl, %edx
953	shr	%cl, %r9d
954	sub	%r9d, %edx
955	jnz	LABEL(less32bytes)
956	movdqa	(%rdi), %xmm3
957
958	UPDATE_STRNCMP_COUNTER
959
960	pxor	%xmm0, %xmm0
961	mov	$16, %rcx	/* index for loads */
962	mov	$6, %r9d	/* byte position left over from less32bytes case */
963	/*
964	 * Setup %r10 value allows us to detect crossing a page boundary.
965	 * When %r10 goes positive we have crossed a page boundary and
966	 * need to do a nibble.
967	 */
968	lea	6(%rdi), %r10
969	and	$0xfff, %r10	/* offset into 4K page */
970	sub	$0x1000, %r10	/* subtract 4K pagesize */
971
972	.p2align 4
973LABEL(loop_ashr_6):
974	add	$16, %r10
975	jg	LABEL(nibble_ashr_6)
976
977LABEL(gobble_ashr_6):
978	movdqa	(%rsi, %rcx), %xmm1
979	movdqa	(%rdi, %rcx), %xmm2
980	movdqa	%xmm2, %xmm4
981
982#ifndef USE_SSSE3
983	psrldq	$6, %xmm3
984	pslldq	$10, %xmm2
985	por	%xmm3, %xmm2		/* merge into one 16byte value */
986#else
987	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
988#endif
989	TOLOWER (%xmm1, %xmm2)
990
991	pcmpeqb	%xmm1, %xmm0
992	pcmpeqb	%xmm2, %xmm1
993	psubb	%xmm0, %xmm1
994	pmovmskb %xmm1, %edx
995	sub	$0xffff, %edx
996	jnz	LABEL(exit)
997
998#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
999	sub	$16, %r11
1000	jbe	LABEL(strcmp_exitz)
1001#endif
1002
1003	add	$16, %rcx
1004	movdqa	%xmm4, %xmm3
1005
1006	add	$16, %r10
1007	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
1008
1009	movdqa	(%rsi, %rcx), %xmm1
1010	movdqa	(%rdi, %rcx), %xmm2
1011	movdqa	%xmm2, %xmm4
1012
1013#ifndef USE_SSSE3
1014	psrldq	$6, %xmm3
1015	pslldq	$10, %xmm2
1016	por	%xmm3, %xmm2		/* merge into one 16byte value */
1017#else
1018	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
1019#endif
1020	TOLOWER (%xmm1, %xmm2)
1021
1022	pcmpeqb	%xmm1, %xmm0
1023	pcmpeqb	%xmm2, %xmm1
1024	psubb	%xmm0, %xmm1
1025	pmovmskb %xmm1, %edx
1026	sub	$0xffff, %edx
1027	jnz	LABEL(exit)
1028
1029#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1030	sub	$16, %r11
1031	jbe	LABEL(strcmp_exitz)
1032#endif
1033
1034	add	$16, %rcx
1035	movdqa	%xmm4, %xmm3
1036	jmp	LABEL(loop_ashr_6)
1037
1038	.p2align 4
1039LABEL(nibble_ashr_6):
1040	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1041	pmovmskb %xmm0, %edx
1042	test	$0xffc0, %edx
1043	jnz	LABEL(ashr_6_exittail)
1044
1045#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1046	cmp	$10, %r11
1047	jbe	LABEL(ashr_6_exittail)
1048#endif
1049
1050	pxor	%xmm0, %xmm0
1051	sub	$0x1000, %r10
1052	jmp	LABEL(gobble_ashr_6)
1053
1054	.p2align 4
1055LABEL(ashr_6_exittail):
1056	movdqa	(%rsi, %rcx), %xmm1
1057	psrldq	$6, %xmm0
1058	psrldq	$6, %xmm3
1059	jmp	LABEL(aftertail)
1060
1061/*
1062 * The following cases will be handled by ashr_7
1063 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
1064 *        n(9~15)          n - 9                6(15 +(n - 9) - n)         ashr_7
1065 */
1066	.p2align 4
1067LABEL(ashr_7):
1068	pxor	%xmm0, %xmm0
1069	movdqa	(%rdi), %xmm2
1070	movdqa	(%rsi), %xmm1
1071	pcmpeqb	%xmm1, %xmm0
1072	pslldq	$9, %xmm2
1073	TOLOWER (%xmm1, %xmm2)
1074	pcmpeqb	%xmm1, %xmm2
1075	psubb	%xmm0, %xmm2
1076	pmovmskb %xmm2, %r9d
1077	shr	%cl, %edx
1078	shr	%cl, %r9d
1079	sub	%r9d, %edx
1080	jnz	LABEL(less32bytes)
1081	movdqa	(%rdi), %xmm3
1082
1083	UPDATE_STRNCMP_COUNTER
1084
1085	pxor	%xmm0, %xmm0
1086	mov	$16, %rcx	/* index for loads */
1087	mov	$7, %r9d	/* byte position left over from less32bytes case */
1088	/*
1089	 * Setup %r10 value allows us to detect crossing a page boundary.
1090	 * When %r10 goes positive we have crossed a page boundary and
1091	 * need to do a nibble.
1092	 */
1093	lea	7(%rdi), %r10
1094	and	$0xfff, %r10	/* offset into 4K page */
1095	sub	$0x1000, %r10	/* subtract 4K pagesize */
1096
1097	.p2align 4
1098LABEL(loop_ashr_7):
1099	add	$16, %r10
1100	jg	LABEL(nibble_ashr_7)
1101
1102LABEL(gobble_ashr_7):
1103	movdqa	(%rsi, %rcx), %xmm1
1104	movdqa	(%rdi, %rcx), %xmm2
1105	movdqa	%xmm2, %xmm4
1106
1107#ifndef USE_SSSE3
1108	psrldq	$7, %xmm3
1109	pslldq	$9, %xmm2
1110	por	%xmm3, %xmm2		/* merge into one 16byte value */
1111#else
1112	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
1113#endif
1114	TOLOWER (%xmm1, %xmm2)
1115
1116	pcmpeqb	%xmm1, %xmm0
1117	pcmpeqb	%xmm2, %xmm1
1118	psubb	%xmm0, %xmm1
1119	pmovmskb %xmm1, %edx
1120	sub	$0xffff, %edx
1121	jnz	LABEL(exit)
1122
1123#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1124	sub	$16, %r11
1125	jbe	LABEL(strcmp_exitz)
1126#endif
1127
1128	add	$16, %rcx
1129	movdqa	%xmm4, %xmm3
1130
1131	add	$16, %r10
1132	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
1133
1134	movdqa	(%rsi, %rcx), %xmm1
1135	movdqa	(%rdi, %rcx), %xmm2
1136	movdqa	%xmm2, %xmm4
1137
1138#ifndef USE_SSSE3
1139	psrldq	$7, %xmm3
1140	pslldq	$9, %xmm2
1141	por	%xmm3, %xmm2		/* merge into one 16byte value */
1142#else
1143	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
1144#endif
1145	TOLOWER (%xmm1, %xmm2)
1146
1147	pcmpeqb	%xmm1, %xmm0
1148	pcmpeqb	%xmm2, %xmm1
1149	psubb	%xmm0, %xmm1
1150	pmovmskb %xmm1, %edx
1151	sub	$0xffff, %edx
1152	jnz	LABEL(exit)
1153
1154#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1155	sub	$16, %r11
1156	jbe	LABEL(strcmp_exitz)
1157#endif
1158
1159	add	$16, %rcx
1160	movdqa	%xmm4, %xmm3
1161	jmp	LABEL(loop_ashr_7)
1162
1163	.p2align 4
1164LABEL(nibble_ashr_7):
1165	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1166	pmovmskb %xmm0, %edx
1167	test	$0xff80, %edx
1168	jnz	LABEL(ashr_7_exittail)
1169
1170#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1171	cmp	$9, %r11
1172	jbe	LABEL(ashr_7_exittail)
1173#endif
1174
1175	pxor	%xmm0, %xmm0
1176	sub	$0x1000, %r10
1177	jmp	LABEL(gobble_ashr_7)
1178
1179	.p2align 4
1180LABEL(ashr_7_exittail):
1181	movdqa	(%rsi, %rcx), %xmm1
1182	psrldq	$7, %xmm0
1183	psrldq	$7, %xmm3
1184	jmp	LABEL(aftertail)
1185
/*
 *  The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(8~15)          n - 8                7(15 +(n - 8) - n)         ashr_8
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    8; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 8) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$8, %xmm2		/* move the %rdi chunk up by 16 - 8 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_8)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx		/* bits 8..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_8_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$8, %r11
	jbe	LABEL(ashr_8_exittail)	/* at most 16 - 8 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_8)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 8 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0
	psrldq	$8, %xmm3
	jmp	LABEL(aftertail)
1310
/*
 *  The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(7~15)          n - 7                8(15 +(n - 7) - n)         ashr_9
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    9; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 9) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$7, %xmm2		/* move the %rdi chunk up by 16 - 9 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx		/* bits 9..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_9_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$7, %r11
	jbe	LABEL(ashr_9_exittail)	/* at most 16 - 9 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_9)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 9 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0
	psrldq	$9, %xmm3
	jmp	LABEL(aftertail)
1435
/*
 *  The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(6~15)          n - 6                9(15 +(n - 6) - n)         ashr_10
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    10; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 10) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$6, %xmm2		/* move the %rdi chunk up by 16 - 10 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_10)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx		/* bits 10..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_10_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$6, %r11
	jbe	LABEL(ashr_10_exittail)	/* at most 16 - 10 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_10)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 10 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0
	psrldq	$10, %xmm3
	jmp	LABEL(aftertail)
1560
/*
 *  The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(5~15)          n - 5               10(15 +(n - 5) - n)         ashr_11
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    11; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 11) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$5, %xmm2		/* move the %rdi chunk up by 16 - 11 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_11)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* bits 11..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_11_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$5, %r11
	jbe	LABEL(ashr_11_exittail)	/* at most 16 - 11 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_11)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 11 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0
	psrldq	$11, %xmm3
	jmp	LABEL(aftertail)
1685
/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(4~15)          n - 4                11(15 +(n - 4) - n)         ashr_12
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    12; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 12) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$4, %xmm2		/* move the %rdi chunk up by 16 - 12 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_12):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)

LABEL(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* bits 12..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_12_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$4, %r11
	jbe	LABEL(ashr_12_exittail)	/* at most 16 - 12 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_12)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 12 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0
	psrldq	$12, %xmm3
	jmp	LABEL(aftertail)
1810
/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(3~15)          n - 3                12(15 +(n - 3) - n)         ashr_13
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    13; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 13) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$3, %xmm2		/* move the %rdi chunk up by 16 - 13 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* bits 13..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_13_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$3, %r11
	jbe	LABEL(ashr_13_exittail)	/* at most 16 - 13 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_13)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 13 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0
	psrldq	$13, %xmm3
	jmp	LABEL(aftertail)
1935
/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(2~15)          n - 2                13(15 +(n - 2) - n)         ashr_14
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    14; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 14) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$2, %xmm2		/* move the %rdi chunk up by 16 - 14 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */
	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_14):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)

LABEL(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_14)

	/* Reached when %r10 goes positive: inspect the bytes already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx		/* bits 14..15: the bytes of %xmm3 the next merge would use */
	jnz	LABEL(ashr_14_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$2, %r11
	jbe	LABEL(ashr_14_exittail)	/* at most 16 - 14 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_14)

	/* Finish using only the bytes in %xmm3: shift the NUL mask and the
	   pending bytes down by 14 so they line up with the %rsi chunk.  */
	.p2align 4
LABEL(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0
	psrldq	$14, %xmm3
	jmp	LABEL(aftertail)
2060
/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
 *        n(1~15)          n - 1                14(15 +(n - 1) - n)         ashr_15
 *
 *  Register roles inside this section:
 *    %xmm0  all-zero before every compare; pcmpeqb against it marks NUL bytes.
 *           It is still zero on loop fall-through because any NUL takes the
 *           exit branch.
 *    %xmm3  previous aligned 16 bytes read from %rdi, carried between steps
 *    %xmm4  untouched copy of the current %rdi chunk (next step's %xmm3)
 *    %rcx   byte index used for both aligned loads (starts at 16)
 *    %r9    15; LABEL(exit) computes the %rdi offset as -16(%r9, %rcx)
 *    %r10   ((%rdi + 15) & 0xfff) - 0x1000, advanced by 16 per chunk
 *    %r11   remaining count (strncmp / strncasecmp_l builds only)
 *  %cl, %edx and %rax are consumed on entry but are set up before this section.
 */
	.p2align 4
LABEL(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi chunk holds a NUL */
	pslldq	$1, %xmm2		/* move the %rdi chunk up by 16 - 15 bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the two chunks agree */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* shifted masks differ: resolve in less32bytes */

	movdqa	(%rdi), %xmm3

	/* Expands to nothing unless built as strncmp/strncasecmp_l; there it
	   recomputes %r11, may branch to strcmp_exitz, and clobbers %r9.  */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	/* Main loop, unrolled twice; each half compares one 16-byte chunk.  */
	.p2align 4
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw chunk for the next step */

#ifndef USE_SSSE3
	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero only when all 16 bytes are equal and non-NUL */
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference found */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second unrolled copy of the step above.  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

#ifndef USE_SSSE3
	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */
#else
	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
#endif
	TOLOWER (%xmm1, %xmm2)

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	/* Reached when %r10 goes positive: inspect the byte already held in
	   %xmm3 before loading the next %rdi chunk.  */
	.p2align 4
LABEL(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx		/* bit 15: the byte of %xmm3 the next merge would use */
	jnz	LABEL(ashr_15_exittail)

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$1, %r11
	jbe	LABEL(ashr_15_exittail)	/* at most 16 - 15 bytes left in the count */
#endif

	pxor	%xmm0, %xmm0		/* restore the all-zero NUL detector */
	sub	$0x1000, %r10		/* re-arm the page counter for another 4K */
	jmp	LABEL(gobble_ashr_15)

	/* Finish using only the byte in %xmm3: shift the pending byte and the
	   NUL mask down by 15 so they line up with the %rsi chunk.  There is
	   no jump here: execution continues into the code that follows.  */
	.p2align 4
LABEL(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3
	psrldq	$15, %xmm0
2186
/*
 * Common exit path.
 *
 * aftertail:   compares the %rsi chunk in %xmm1 with the shifted leftover
 *              bytes in %xmm3; %xmm0 holds the shifted NUL mask.  After the
 *              "not", a set bit in %edx marks a byte that differs or is NUL.
 * exit:        %edx is non-zero with its lowest set bit at the first
 *              differing/NUL byte; %rax becomes the %rdi offset.
 * less32bytes: turns the offsets in %rax / %rcx into real addresses and, if
 *              %r8d is non-zero, swaps the two pointers back.
 * ret:         picks the byte index with bsf and returns the byte difference.
 */
	.p2align 4
LABEL(aftertail):
	TOLOWER (%xmm1, %xmm3)
	pcmpeqb	%xmm3, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	not	%edx

	.p2align 4
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)	/* that byte is at or past the count: equal */
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the _nl_C_LC_CTYPE_tolower table
	   (4-byte entries, base biased by 128 entries).  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax		/* result = byte at (%rdi) - byte at (%rsi) */
	ret

/* Strings compare equal: return 0.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret
2229
/*
 * Return the difference of the first bytes at (%rdi) and (%rsi).  In the
 * case-insensitive builds both bytes are first mapped through the
 * _nl_C_LC_CTYPE_tolower table.  The jump that reaches this label is not
 * in this part of the file.
 */
	.p2align 4
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx	/* table entries are 4 bytes wide */
	movl	(%rdx,%rax,4), %eax
#endif

	sub	%ecx, %eax
	ret
END (STRCMP)
2244
/*
 * Dispatch table with sixteen 32-bit entries.  Each entry is the distance
 * from the table itself to one of the ashr_N handlers, so the table holds
 * no absolute addresses.  Entries run ashr_1 .. ashr_15, then ashr_0.
 * NOTE(review): the code that indexes this table is outside this part of
 * the file; confirm the index convention there.
 */
	.section .rodata,"a",@progbits
	.p2align 3
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
libc_hidden_builtin_def (STRCMP)
2265