1/* memcmp with SSSE3, wmemcmp with SSSE3
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef MEMCMP
24#  define MEMCMP	__memcmp_ssse3
25# endif
26
27/* Warning!
28	   wmemcmp has to use SIGNED comparison for elements.
29	   memcmp has to use UNSIGNED comparison for elements.
30*/
31
32	atom_text_section
/* Entry point.  Pointers arrive in %rdi and %rsi, the count in %rdx.
   From here on RCX holds the length in bytes.  Inputs shorter than 48
   bytes go straight to the tail code, longer ones to the SSE path.  */
33ENTRY (MEMCMP)
34# ifdef USE_AS_WMEMCMP
	/* Multiply the count by 4 to get bytes; a zero length branches to
	   L(equal), which is defined outside this part of the file.  */
35	shl	$2, %RDX_LP
36	test	%RDX_LP, %RDX_LP
37	jz	L(equal)
38# elif defined __ILP32__
39	/* Clear the upper 32 bits.  */
40	mov	%edx, %edx
41# endif
42	mov	%rdx, %rcx
43	mov	%rdi, %rdx
44	cmp	$48, %rcx;
45	jae	L(48bytesormore)	/* LEN >= 48  */
46
	/* Short input: move both pointers to the end of the data.  The tail
	   code reached through L(less48bytes) reads at negative offsets.  */
47	add	%rcx, %rsi
48	add	%rcx, %rdi
49	jmp	L(less48bytes)
50
51	.p2align 4
52/* RCX (length in bytes) >= 48 here.  */
53L(48bytesormore):
	/* Compare the first 16 bytes with unaligned loads.  EDX becomes
	   (equality mask) - 0xffff, which is zero only if all 16 match.
	   Both pointers are advanced past the chunk before the test.  */
54	movdqu	(%rdi), %xmm3
55	movdqu	(%rsi), %xmm0
56	pcmpeqb	%xmm0, %xmm3
57	pmovmskb %xmm3, %edx
58	lea	16(%rdi), %rdi
59	lea	16(%rsi), %rsi
60	sub	$0xffff, %edx
61	jnz	L(less16bytes)
	/* Round %rdi down to a 16-byte boundary.  %rsi moves back and RCX
	   grows by the same amount, so all three still describe the same
	   data.  RCX still includes the 16 bytes compared above.  */
62	mov	%edi, %edx
63	and	$0xf, %edx
64	xor	%rdx, %rdi
65	sub	%rdx, %rsi
66	add	%rdx, %rcx
	/* EDX = offset of %rsi inside its 16-byte chunk.  Zero means both
	   pointers are aligned; otherwise round %rsi down as well and pick
	   the routine written for that offset.  */
67	mov	%esi, %edx
68	and	$0xf, %edx
69	jz	L(shr_0)
70	xor	%rdx, %rsi
71
72# ifndef USE_AS_WMEMCMP
	/* Compare chain on the offset, split in two halves at 8.
	   NOTE(review): EDX cannot be 0 at this point (the jz above already
	   took that case), so the first test below is never true.  */
73	cmp	$8, %edx
74	jae	L(next_unaligned_table)
75	cmp	$0, %edx
76	je	L(shr_0)
77	cmp	$1, %edx
78	je	L(shr_1)
79	cmp	$2, %edx
80	je	L(shr_2)
81	cmp	$3, %edx
82	je	L(shr_3)
83	cmp	$4, %edx
84	je	L(shr_4)
85	cmp	$5, %edx
86	je	L(shr_5)
87	cmp	$6, %edx
88	je	L(shr_6)
89	jmp	L(shr_7)
90
91	.p2align 2
92L(next_unaligned_table):
	/* Offsets 8..15.  */
93	cmp	$8, %edx
94	je	L(shr_8)
95	cmp	$9, %edx
96	je	L(shr_9)
97	cmp	$10, %edx
98	je	L(shr_10)
99	cmp	$11, %edx
100	je	L(shr_11)
101	cmp	$12, %edx
102	je	L(shr_12)
103	cmp	$13, %edx
104	je	L(shr_13)
105	cmp	$14, %edx
106	je	L(shr_14)
107	jmp	L(shr_15)
108# else
	/* wmemcmp: only offsets 0, 4, 8 and 12 are dispatched; anything else
	   lands in L(shr_12).  Presumably both inputs are 4-byte aligned so
	   no other offset can occur -- confirm against callers.  */
109	cmp	$0, %edx
110	je	L(shr_0)
111	cmp	$4, %edx
112	je	L(shr_4)
113	cmp	$8, %edx
114	je	L(shr_8)
115	jmp	L(shr_12)
116# endif
117
/* L(shr_0): both pointers are 16-byte aligned, so aligned loads can be
   compared directly.  EAX is cleared because L(exit) adds it to %rsi.  */
118	.p2align 4
119L(shr_0):
	/* The jae uses the flags of the cmp; lea does not change them.
	   RCX loses the 16 bytes already compared plus the next 32.  */
120	cmp	$80, %rcx
121	lea	-48(%rcx), %rcx
122	jae	L(shr_0_gobble)
	/* RCX was below 80: compare one 32-byte block, no loop.  xmm1 keeps
	   the mask of the first 16 bytes for L(exit).  */
123	xor	%eax, %eax
124	movdqa	(%rsi), %xmm1
125	pcmpeqb	(%rdi), %xmm1
126	movdqa	16(%rsi), %xmm2
127	pcmpeqb	16(%rdi), %xmm2
128	pand	%xmm1, %xmm2
129	pmovmskb %xmm2, %edx
130	lea	32(%rdi), %rdi
131	lea	32(%rsi), %rsi
132	sub	$0xffff, %edx
133	jnz	L(exit)
	/* Block equal: move both pointers to the end of the data and compare
	   the last RCX bytes in the tail code.  */
134	add	%rcx, %rsi
135	add	%rcx, %rdi
136	jmp	L(less48bytes)
137
138	.p2align 4
139L(shr_0_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
140	movdqa	(%rsi), %xmm0
141	xor	%eax, %eax
142	pcmpeqb	(%rdi), %xmm0
143	sub	$32, %rcx
144	movdqa	16(%rsi), %xmm2
145	pcmpeqb	16(%rdi), %xmm2
146L(shr_0_gobble_loop):
147	pand	%xmm0, %xmm2
148	sub	$32, %rcx
149	pmovmskb %xmm2, %edx
150	movdqa	%xmm0, %xmm1
151	movdqa	32(%rsi), %xmm0
152	movdqa	48(%rsi), %xmm2
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
153	sbb	$0xffff, %edx
154	pcmpeqb	32(%rdi), %xmm0
155	pcmpeqb	48(%rdi), %xmm2
156	lea	32(%rdi), %rdi
157	lea	32(%rsi), %rsi
158	jz	L(shr_0_gobble_loop)
159
160	pand	%xmm0, %xmm2
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
161	cmp	$0, %rcx
162	jge	L(next)
163	inc	%edx
164	add	$32, %rcx
165L(next):
166	test	%edx, %edx
167	jnz	L(exit)
168
	/* Now check the block preloaded by the last loop iteration.  */
169	pmovmskb %xmm2, %edx
170	movdqa	%xmm0, %xmm1
171	lea	32(%rdi), %rdi
172	lea	32(%rsi), %rsi
173	sub	$0xffff, %edx
174	jnz	L(exit)
	/* Everything matched: finish in the tail code.  */
175	add	%rcx, %rsi
176	add	%rcx, %rdi
177	jmp	L(less48bytes)
178
179# ifndef USE_AS_WMEMCMP
180
/* L(shr_1): after alignment the source data sits 1 byte above the
   16-byte-aligned %rsi.  palignr $1 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
181	.p2align 4
182L(shr_1):
183	cmp	$80, %rcx
184	lea	-48(%rcx), %rcx
185	mov	%edx, %eax
186	jae	L(shr_1_gobble)
187
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
188	movdqa	16(%rsi), %xmm1
189	movdqa	%xmm1, %xmm2
190	palignr	$1, (%rsi), %xmm1
191	pcmpeqb	(%rdi), %xmm1
192
193	movdqa	32(%rsi), %xmm3
194	palignr	$1, %xmm2, %xmm3
195	pcmpeqb	16(%rdi), %xmm3
196
197	pand	%xmm1, %xmm3
198	pmovmskb %xmm3, %edx
199	lea	32(%rdi), %rdi
200	lea	32(%rsi), %rsi
201	sub	$0xffff, %edx
202	jnz	L(exit)
	/* Block equal: restore the 1-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
203	add	$1, %rsi
204	add	%rcx, %rsi
205	add	%rcx, %rdi
206	jmp	L(less48bytes)
207
208	.p2align 4
209L(shr_1_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
210	sub	$32, %rcx
211	movdqa	16(%rsi), %xmm0
212	palignr	$1, (%rsi), %xmm0
213	pcmpeqb	(%rdi), %xmm0
214
215	movdqa	32(%rsi), %xmm3
216	palignr	$1, 16(%rsi), %xmm3
217	pcmpeqb	16(%rdi), %xmm3
218
219L(shr_1_gobble_loop):
220	pand	%xmm0, %xmm3
221	sub	$32, %rcx
222	pmovmskb %xmm3, %edx
223	movdqa	%xmm0, %xmm1
224
225	movdqa	64(%rsi), %xmm3
226	palignr	$1, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
227	sbb	$0xffff, %edx
228	movdqa	48(%rsi), %xmm0
229	palignr	$1, 32(%rsi), %xmm0
230	pcmpeqb	32(%rdi), %xmm0
231	lea	32(%rsi), %rsi
232	pcmpeqb	48(%rdi), %xmm3
233
234	lea	32(%rdi), %rdi
235	jz	L(shr_1_gobble_loop)
236	pand	%xmm0, %xmm3
237
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
238	cmp	$0, %rcx
239	jge	L(shr_1_gobble_next)
240	inc	%edx
241	add	$32, %rcx
242L(shr_1_gobble_next):
243	test	%edx, %edx
244	jnz	L(exit)
245
	/* Now check the block preloaded by the last loop iteration.  */
246	pmovmskb %xmm3, %edx
247	movdqa	%xmm0, %xmm1
248	lea	32(%rdi), %rdi
249	lea	32(%rsi), %rsi
250	sub	$0xffff, %edx
251	jnz	L(exit)
252
	/* Everything matched: restore the 1-byte offset and finish in the
	   tail code.  */
253	lea	1(%rsi), %rsi
254	add	%rcx, %rsi
255	add	%rcx, %rdi
256	jmp	L(less48bytes)
257
258
/* L(shr_2): after alignment the source data sits 2 bytes above the
   16-byte-aligned %rsi.  palignr $2 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
259	.p2align 4
260L(shr_2):
261	cmp	$80, %rcx
262	lea	-48(%rcx), %rcx
263	mov	%edx, %eax
264	jae	L(shr_2_gobble)
265
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
266	movdqa	16(%rsi), %xmm1
267	movdqa	%xmm1, %xmm2
268	palignr	$2, (%rsi), %xmm1
269	pcmpeqb	(%rdi), %xmm1
270
271	movdqa	32(%rsi), %xmm3
272	palignr	$2, %xmm2, %xmm3
273	pcmpeqb	16(%rdi), %xmm3
274
275	pand	%xmm1, %xmm3
276	pmovmskb %xmm3, %edx
277	lea	32(%rdi), %rdi
278	lea	32(%rsi), %rsi
279	sub	$0xffff, %edx
280	jnz	L(exit)
	/* Block equal: restore the 2-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
281	add	$2, %rsi
282	add	%rcx, %rsi
283	add	%rcx, %rdi
284	jmp	L(less48bytes)
285
286	.p2align 4
287L(shr_2_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
288	sub	$32, %rcx
289	movdqa	16(%rsi), %xmm0
290	palignr	$2, (%rsi), %xmm0
291	pcmpeqb	(%rdi), %xmm0
292
293	movdqa	32(%rsi), %xmm3
294	palignr	$2, 16(%rsi), %xmm3
295	pcmpeqb	16(%rdi), %xmm3
296
297L(shr_2_gobble_loop):
298	pand	%xmm0, %xmm3
299	sub	$32, %rcx
300	pmovmskb %xmm3, %edx
301	movdqa	%xmm0, %xmm1
302
303	movdqa	64(%rsi), %xmm3
304	palignr	$2, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
305	sbb	$0xffff, %edx
306	movdqa	48(%rsi), %xmm0
307	palignr	$2, 32(%rsi), %xmm0
308	pcmpeqb	32(%rdi), %xmm0
309	lea	32(%rsi), %rsi
310	pcmpeqb	48(%rdi), %xmm3
311
312	lea	32(%rdi), %rdi
313	jz	L(shr_2_gobble_loop)
314	pand	%xmm0, %xmm3
315
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
316	cmp	$0, %rcx
317	jge	L(shr_2_gobble_next)
318	inc	%edx
319	add	$32, %rcx
320L(shr_2_gobble_next):
321	test	%edx, %edx
322	jnz	L(exit)
323
	/* Now check the block preloaded by the last loop iteration.  */
324	pmovmskb %xmm3, %edx
325	movdqa	%xmm0, %xmm1
326	lea	32(%rdi), %rdi
327	lea	32(%rsi), %rsi
328	sub	$0xffff, %edx
329	jnz	L(exit)
330
	/* Everything matched: restore the 2-byte offset and finish in the
	   tail code.  */
331	lea	2(%rsi), %rsi
332	add	%rcx, %rsi
333	add	%rcx, %rdi
334	jmp	L(less48bytes)
335
/* L(shr_3): after alignment the source data sits 3 bytes above the
   16-byte-aligned %rsi.  palignr $3 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
336	.p2align 4
337L(shr_3):
338	cmp	$80, %rcx
339	lea	-48(%rcx), %rcx
340	mov	%edx, %eax
341	jae	L(shr_3_gobble)
342
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
343	movdqa	16(%rsi), %xmm1
344	movdqa	%xmm1, %xmm2
345	palignr	$3, (%rsi), %xmm1
346	pcmpeqb	(%rdi), %xmm1
347
348	movdqa	32(%rsi), %xmm3
349	palignr	$3, %xmm2, %xmm3
350	pcmpeqb	16(%rdi), %xmm3
351
352	pand	%xmm1, %xmm3
353	pmovmskb %xmm3, %edx
354	lea	32(%rdi), %rdi
355	lea	32(%rsi), %rsi
356	sub	$0xffff, %edx
357	jnz	L(exit)
	/* Block equal: restore the 3-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
358	add	$3, %rsi
359	add	%rcx, %rsi
360	add	%rcx, %rdi
361	jmp	L(less48bytes)
362
363	.p2align 4
364L(shr_3_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
365	sub	$32, %rcx
366	movdqa	16(%rsi), %xmm0
367	palignr	$3, (%rsi), %xmm0
368	pcmpeqb	(%rdi), %xmm0
369
370	movdqa	32(%rsi), %xmm3
371	palignr	$3, 16(%rsi), %xmm3
372	pcmpeqb	16(%rdi), %xmm3
373
374L(shr_3_gobble_loop):
375	pand	%xmm0, %xmm3
376	sub	$32, %rcx
377	pmovmskb %xmm3, %edx
378	movdqa	%xmm0, %xmm1
379
380	movdqa	64(%rsi), %xmm3
381	palignr	$3, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
382	sbb	$0xffff, %edx
383	movdqa	48(%rsi), %xmm0
384	palignr	$3, 32(%rsi), %xmm0
385	pcmpeqb	32(%rdi), %xmm0
386	lea	32(%rsi), %rsi
387	pcmpeqb	48(%rdi), %xmm3
388
389	lea	32(%rdi), %rdi
390	jz	L(shr_3_gobble_loop)
391	pand	%xmm0, %xmm3
392
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
393	cmp	$0, %rcx
394	jge	L(shr_3_gobble_next)
395	inc	%edx
396	add	$32, %rcx
397L(shr_3_gobble_next):
398	test	%edx, %edx
399	jnz	L(exit)
400
	/* Now check the block preloaded by the last loop iteration.  */
401	pmovmskb %xmm3, %edx
402	movdqa	%xmm0, %xmm1
403	lea	32(%rdi), %rdi
404	lea	32(%rsi), %rsi
405	sub	$0xffff, %edx
406	jnz	L(exit)
407
	/* Everything matched: restore the 3-byte offset and finish in the
	   tail code.  */
408	lea	3(%rsi), %rsi
409	add	%rcx, %rsi
410	add	%rcx, %rdi
411	jmp	L(less48bytes)
412
413# endif
414
/* L(shr_4): after alignment the source data sits 4 bytes above the
   16-byte-aligned %rsi.  palignr $4 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  This routine is
   assembled for both memcmp and wmemcmp.  */
415	.p2align 4
416L(shr_4):
417	cmp	$80, %rcx
418	lea	-48(%rcx), %rcx
419	mov	%edx, %eax
420	jae	L(shr_4_gobble)
421
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
422	movdqa	16(%rsi), %xmm1
423	movdqa	%xmm1, %xmm2
424	palignr	$4, (%rsi), %xmm1
425	pcmpeqb	(%rdi), %xmm1
426
427	movdqa	32(%rsi), %xmm3
428	palignr	$4, %xmm2, %xmm3
429	pcmpeqb	16(%rdi), %xmm3
430
431	pand	%xmm1, %xmm3
432	pmovmskb %xmm3, %edx
433	lea	32(%rdi), %rdi
434	lea	32(%rsi), %rsi
435	sub	$0xffff, %edx
436	jnz	L(exit)
	/* Block equal: restore the 4-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
437	add	$4, %rsi
438	add	%rcx, %rsi
439	add	%rcx, %rdi
440	jmp	L(less48bytes)
441
442	.p2align 4
443L(shr_4_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
444	sub	$32, %rcx
445	movdqa	16(%rsi), %xmm0
446	palignr	$4, (%rsi), %xmm0
447	pcmpeqb	(%rdi), %xmm0
448
449	movdqa	32(%rsi), %xmm3
450	palignr	$4, 16(%rsi), %xmm3
451	pcmpeqb	16(%rdi), %xmm3
452
453L(shr_4_gobble_loop):
454	pand	%xmm0, %xmm3
455	sub	$32, %rcx
456	pmovmskb %xmm3, %edx
457	movdqa	%xmm0, %xmm1
458
459	movdqa	64(%rsi), %xmm3
460	palignr	$4, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
461	sbb	$0xffff, %edx
462	movdqa	48(%rsi), %xmm0
463	palignr	$4, 32(%rsi), %xmm0
464	pcmpeqb	32(%rdi), %xmm0
465	lea	32(%rsi), %rsi
466	pcmpeqb	48(%rdi), %xmm3
467
468	lea	32(%rdi), %rdi
469	jz	L(shr_4_gobble_loop)
470	pand	%xmm0, %xmm3
471
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
472	cmp	$0, %rcx
473	jge	L(shr_4_gobble_next)
474	inc	%edx
475	add	$32, %rcx
476L(shr_4_gobble_next):
477	test	%edx, %edx
478	jnz	L(exit)
479
	/* Now check the block preloaded by the last loop iteration.  */
480	pmovmskb %xmm3, %edx
481	movdqa	%xmm0, %xmm1
482	lea	32(%rdi), %rdi
483	lea	32(%rsi), %rsi
484	sub	$0xffff, %edx
485	jnz	L(exit)
486
	/* Everything matched: restore the 4-byte offset and finish in the
	   tail code.  */
487	lea	4(%rsi), %rsi
488	add	%rcx, %rsi
489	add	%rcx, %rdi
490	jmp	L(less48bytes)
491
492# ifndef USE_AS_WMEMCMP
493
/* L(shr_5): after alignment the source data sits 5 bytes above the
   16-byte-aligned %rsi.  palignr $5 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
494	.p2align 4
495L(shr_5):
496	cmp	$80, %rcx
497	lea	-48(%rcx), %rcx
498	mov	%edx, %eax
499	jae	L(shr_5_gobble)
500
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
501	movdqa	16(%rsi), %xmm1
502	movdqa	%xmm1, %xmm2
503	palignr	$5, (%rsi), %xmm1
504	pcmpeqb	(%rdi), %xmm1
505
506	movdqa	32(%rsi), %xmm3
507	palignr	$5, %xmm2, %xmm3
508	pcmpeqb	16(%rdi), %xmm3
509
510	pand	%xmm1, %xmm3
511	pmovmskb %xmm3, %edx
512	lea	32(%rdi), %rdi
513	lea	32(%rsi), %rsi
514	sub	$0xffff, %edx
515	jnz	L(exit)
	/* Block equal: restore the 5-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
516	add	$5, %rsi
517	add	%rcx, %rsi
518	add	%rcx, %rdi
519	jmp	L(less48bytes)
520
521	.p2align 4
522L(shr_5_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
523	sub	$32, %rcx
524	movdqa	16(%rsi), %xmm0
525	palignr	$5, (%rsi), %xmm0
526	pcmpeqb	(%rdi), %xmm0
527
528	movdqa	32(%rsi), %xmm3
529	palignr	$5, 16(%rsi), %xmm3
530	pcmpeqb	16(%rdi), %xmm3
531
532L(shr_5_gobble_loop):
533	pand	%xmm0, %xmm3
534	sub	$32, %rcx
535	pmovmskb %xmm3, %edx
536	movdqa	%xmm0, %xmm1
537
538	movdqa	64(%rsi), %xmm3
539	palignr	$5, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
540	sbb	$0xffff, %edx
541	movdqa	48(%rsi), %xmm0
542	palignr	$5, 32(%rsi), %xmm0
543	pcmpeqb	32(%rdi), %xmm0
544	lea	32(%rsi), %rsi
545	pcmpeqb	48(%rdi), %xmm3
546
547	lea	32(%rdi), %rdi
548	jz	L(shr_5_gobble_loop)
549	pand	%xmm0, %xmm3
550
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
551	cmp	$0, %rcx
552	jge	L(shr_5_gobble_next)
553	inc	%edx
554	add	$32, %rcx
555L(shr_5_gobble_next):
556	test	%edx, %edx
557	jnz	L(exit)
558
	/* Now check the block preloaded by the last loop iteration.  */
559	pmovmskb %xmm3, %edx
560	movdqa	%xmm0, %xmm1
561	lea	32(%rdi), %rdi
562	lea	32(%rsi), %rsi
563	sub	$0xffff, %edx
564	jnz	L(exit)
565
	/* Everything matched: restore the 5-byte offset and finish in the
	   tail code.  */
566	lea	5(%rsi), %rsi
567	add	%rcx, %rsi
568	add	%rcx, %rdi
569	jmp	L(less48bytes)
570
/* L(shr_6): after alignment the source data sits 6 bytes above the
   16-byte-aligned %rsi.  palignr $6 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
571	.p2align 4
572L(shr_6):
573	cmp	$80, %rcx
574	lea	-48(%rcx), %rcx
575	mov	%edx, %eax
576	jae	L(shr_6_gobble)
577
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
578	movdqa	16(%rsi), %xmm1
579	movdqa	%xmm1, %xmm2
580	palignr	$6, (%rsi), %xmm1
581	pcmpeqb	(%rdi), %xmm1
582
583	movdqa	32(%rsi), %xmm3
584	palignr	$6, %xmm2, %xmm3
585	pcmpeqb	16(%rdi), %xmm3
586
587	pand	%xmm1, %xmm3
588	pmovmskb %xmm3, %edx
589	lea	32(%rdi), %rdi
590	lea	32(%rsi), %rsi
591	sub	$0xffff, %edx
592	jnz	L(exit)
	/* Block equal: restore the 6-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
593	add	$6, %rsi
594	add	%rcx, %rsi
595	add	%rcx, %rdi
596	jmp	L(less48bytes)
597
598	.p2align 4
599L(shr_6_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
600	sub	$32, %rcx
601	movdqa	16(%rsi), %xmm0
602	palignr	$6, (%rsi), %xmm0
603	pcmpeqb	(%rdi), %xmm0
604
605	movdqa	32(%rsi), %xmm3
606	palignr	$6, 16(%rsi), %xmm3
607	pcmpeqb	16(%rdi), %xmm3
608
609L(shr_6_gobble_loop):
610	pand	%xmm0, %xmm3
611	sub	$32, %rcx
612	pmovmskb %xmm3, %edx
613	movdqa	%xmm0, %xmm1
614
615	movdqa	64(%rsi), %xmm3
616	palignr	$6, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
617	sbb	$0xffff, %edx
618	movdqa	48(%rsi), %xmm0
619	palignr	$6, 32(%rsi), %xmm0
620	pcmpeqb	32(%rdi), %xmm0
621	lea	32(%rsi), %rsi
622	pcmpeqb	48(%rdi), %xmm3
623
624	lea	32(%rdi), %rdi
625	jz	L(shr_6_gobble_loop)
626	pand	%xmm0, %xmm3
627
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
628	cmp	$0, %rcx
629	jge	L(shr_6_gobble_next)
630	inc	%edx
631	add	$32, %rcx
632L(shr_6_gobble_next):
633	test	%edx, %edx
634	jnz	L(exit)
635
	/* Now check the block preloaded by the last loop iteration.  */
636	pmovmskb %xmm3, %edx
637	movdqa	%xmm0, %xmm1
638	lea	32(%rdi), %rdi
639	lea	32(%rsi), %rsi
640	sub	$0xffff, %edx
641	jnz	L(exit)
642
	/* Everything matched: restore the 6-byte offset and finish in the
	   tail code.  */
643	lea	6(%rsi), %rsi
644	add	%rcx, %rsi
645	add	%rcx, %rdi
646	jmp	L(less48bytes)
647
/* L(shr_7): after alignment the source data sits 7 bytes above the
   16-byte-aligned %rsi.  palignr $7 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
648	.p2align 4
649L(shr_7):
650	cmp	$80, %rcx
651	lea	-48(%rcx), %rcx
652	mov	%edx, %eax
653	jae	L(shr_7_gobble)
654
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
655	movdqa	16(%rsi), %xmm1
656	movdqa	%xmm1, %xmm2
657	palignr	$7, (%rsi), %xmm1
658	pcmpeqb	(%rdi), %xmm1
659
660	movdqa	32(%rsi), %xmm3
661	palignr	$7, %xmm2, %xmm3
662	pcmpeqb	16(%rdi), %xmm3
663
664	pand	%xmm1, %xmm3
665	pmovmskb %xmm3, %edx
666	lea	32(%rdi), %rdi
667	lea	32(%rsi), %rsi
668	sub	$0xffff, %edx
669	jnz	L(exit)
	/* Block equal: restore the 7-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
670	add	$7, %rsi
671	add	%rcx, %rsi
672	add	%rcx, %rdi
673	jmp	L(less48bytes)
674
675	.p2align 4
676L(shr_7_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
677	sub	$32, %rcx
678	movdqa	16(%rsi), %xmm0
679	palignr	$7, (%rsi), %xmm0
680	pcmpeqb	(%rdi), %xmm0
681
682	movdqa	32(%rsi), %xmm3
683	palignr	$7, 16(%rsi), %xmm3
684	pcmpeqb	16(%rdi), %xmm3
685
686L(shr_7_gobble_loop):
687	pand	%xmm0, %xmm3
688	sub	$32, %rcx
689	pmovmskb %xmm3, %edx
690	movdqa	%xmm0, %xmm1
691
692	movdqa	64(%rsi), %xmm3
693	palignr	$7, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
694	sbb	$0xffff, %edx
695	movdqa	48(%rsi), %xmm0
696	palignr	$7, 32(%rsi), %xmm0
697	pcmpeqb	32(%rdi), %xmm0
698	lea	32(%rsi), %rsi
699	pcmpeqb	48(%rdi), %xmm3
700
701	lea	32(%rdi), %rdi
702	jz	L(shr_7_gobble_loop)
703	pand	%xmm0, %xmm3
704
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
705	cmp	$0, %rcx
706	jge	L(shr_7_gobble_next)
707	inc	%edx
708	add	$32, %rcx
709L(shr_7_gobble_next):
710	test	%edx, %edx
711	jnz	L(exit)
712
	/* Now check the block preloaded by the last loop iteration.  */
713	pmovmskb %xmm3, %edx
714	movdqa	%xmm0, %xmm1
715	lea	32(%rdi), %rdi
716	lea	32(%rsi), %rsi
717	sub	$0xffff, %edx
718	jnz	L(exit)
719
	/* Everything matched: restore the 7-byte offset and finish in the
	   tail code.  */
720	lea	7(%rsi), %rsi
721	add	%rcx, %rsi
722	add	%rcx, %rdi
723	jmp	L(less48bytes)
724
725# endif
726
/* L(shr_8): after alignment the source data sits 8 bytes above the
   16-byte-aligned %rsi.  palignr $8 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  This routine is
   assembled for both memcmp and wmemcmp.  */
727	.p2align 4
728L(shr_8):
729	cmp	$80, %rcx
730	lea	-48(%rcx), %rcx
731	mov	%edx, %eax
732	jae	L(shr_8_gobble)
733
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
734	movdqa	16(%rsi), %xmm1
735	movdqa	%xmm1, %xmm2
736	palignr	$8, (%rsi), %xmm1
737	pcmpeqb	(%rdi), %xmm1
738
739	movdqa	32(%rsi), %xmm3
740	palignr	$8, %xmm2, %xmm3
741	pcmpeqb	16(%rdi), %xmm3
742
743	pand	%xmm1, %xmm3
744	pmovmskb %xmm3, %edx
745	lea	32(%rdi), %rdi
746	lea	32(%rsi), %rsi
747	sub	$0xffff, %edx
748	jnz	L(exit)
	/* Block equal: restore the 8-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
749	add	$8, %rsi
750	add	%rcx, %rsi
751	add	%rcx, %rdi
752	jmp	L(less48bytes)
753
754	.p2align 4
755L(shr_8_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
756	sub	$32, %rcx
757	movdqa	16(%rsi), %xmm0
758	palignr	$8, (%rsi), %xmm0
759	pcmpeqb	(%rdi), %xmm0
760
761	movdqa	32(%rsi), %xmm3
762	palignr	$8, 16(%rsi), %xmm3
763	pcmpeqb	16(%rdi), %xmm3
764
765L(shr_8_gobble_loop):
766	pand	%xmm0, %xmm3
767	sub	$32, %rcx
768	pmovmskb %xmm3, %edx
769	movdqa	%xmm0, %xmm1
770
771	movdqa	64(%rsi), %xmm3
772	palignr	$8, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
773	sbb	$0xffff, %edx
774	movdqa	48(%rsi), %xmm0
775	palignr	$8, 32(%rsi), %xmm0
776	pcmpeqb	32(%rdi), %xmm0
777	lea	32(%rsi), %rsi
778	pcmpeqb	48(%rdi), %xmm3
779
780	lea	32(%rdi), %rdi
781	jz	L(shr_8_gobble_loop)
782	pand	%xmm0, %xmm3
783
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
784	cmp	$0, %rcx
785	jge	L(shr_8_gobble_next)
786	inc	%edx
787	add	$32, %rcx
788L(shr_8_gobble_next):
789	test	%edx, %edx
790	jnz	L(exit)
791
	/* Now check the block preloaded by the last loop iteration.  */
792	pmovmskb %xmm3, %edx
793	movdqa	%xmm0, %xmm1
794	lea	32(%rdi), %rdi
795	lea	32(%rsi), %rsi
796	sub	$0xffff, %edx
797	jnz	L(exit)
798
	/* Everything matched: restore the 8-byte offset and finish in the
	   tail code.  */
799	lea	8(%rsi), %rsi
800	add	%rcx, %rsi
801	add	%rcx, %rdi
802	jmp	L(less48bytes)
803
804# ifndef USE_AS_WMEMCMP
805
/* L(shr_9): after alignment the source data sits 9 bytes above the
   16-byte-aligned %rsi.  palignr $9 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
806	.p2align 4
807L(shr_9):
808	cmp	$80, %rcx
809	lea	-48(%rcx), %rcx
810	mov	%edx, %eax
811	jae	L(shr_9_gobble)
812
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
813	movdqa	16(%rsi), %xmm1
814	movdqa	%xmm1, %xmm2
815	palignr	$9, (%rsi), %xmm1
816	pcmpeqb	(%rdi), %xmm1
817
818	movdqa	32(%rsi), %xmm3
819	palignr	$9, %xmm2, %xmm3
820	pcmpeqb	16(%rdi), %xmm3
821
822	pand	%xmm1, %xmm3
823	pmovmskb %xmm3, %edx
824	lea	32(%rdi), %rdi
825	lea	32(%rsi), %rsi
826	sub	$0xffff, %edx
827	jnz	L(exit)
	/* Block equal: restore the 9-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
828	add	$9, %rsi
829	add	%rcx, %rsi
830	add	%rcx, %rdi
831	jmp	L(less48bytes)
832
833	.p2align 4
834L(shr_9_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
835	sub	$32, %rcx
836	movdqa	16(%rsi), %xmm0
837	palignr	$9, (%rsi), %xmm0
838	pcmpeqb	(%rdi), %xmm0
839
840	movdqa	32(%rsi), %xmm3
841	palignr	$9, 16(%rsi), %xmm3
842	pcmpeqb	16(%rdi), %xmm3
843
844L(shr_9_gobble_loop):
845	pand	%xmm0, %xmm3
846	sub	$32, %rcx
847	pmovmskb %xmm3, %edx
848	movdqa	%xmm0, %xmm1
849
850	movdqa	64(%rsi), %xmm3
851	palignr	$9, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
852	sbb	$0xffff, %edx
853	movdqa	48(%rsi), %xmm0
854	palignr	$9, 32(%rsi), %xmm0
855	pcmpeqb	32(%rdi), %xmm0
856	lea	32(%rsi), %rsi
857	pcmpeqb	48(%rdi), %xmm3
858
859	lea	32(%rdi), %rdi
860	jz	L(shr_9_gobble_loop)
861	pand	%xmm0, %xmm3
862
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
863	cmp	$0, %rcx
864	jge	L(shr_9_gobble_next)
865	inc	%edx
866	add	$32, %rcx
867L(shr_9_gobble_next):
868	test	%edx, %edx
869	jnz	L(exit)
870
	/* Now check the block preloaded by the last loop iteration.  */
871	pmovmskb %xmm3, %edx
872	movdqa	%xmm0, %xmm1
873	lea	32(%rdi), %rdi
874	lea	32(%rsi), %rsi
875	sub	$0xffff, %edx
876	jnz	L(exit)
877
	/* Everything matched: restore the 9-byte offset and finish in the
	   tail code.  */
878	lea	9(%rsi), %rsi
879	add	%rcx, %rsi
880	add	%rcx, %rdi
881	jmp	L(less48bytes)
882
/* L(shr_10): after alignment the source data sits 10 bytes above the
   16-byte-aligned %rsi.  palignr $10 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
883	.p2align 4
884L(shr_10):
885	cmp	$80, %rcx
886	lea	-48(%rcx), %rcx
887	mov	%edx, %eax
888	jae	L(shr_10_gobble)
889
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
890	movdqa	16(%rsi), %xmm1
891	movdqa	%xmm1, %xmm2
892	palignr	$10, (%rsi), %xmm1
893	pcmpeqb	(%rdi), %xmm1
894
895	movdqa	32(%rsi), %xmm3
896	palignr	$10, %xmm2, %xmm3
897	pcmpeqb	16(%rdi), %xmm3
898
899	pand	%xmm1, %xmm3
900	pmovmskb %xmm3, %edx
901	lea	32(%rdi), %rdi
902	lea	32(%rsi), %rsi
903	sub	$0xffff, %edx
904	jnz	L(exit)
	/* Block equal: restore the 10-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
905	add	$10, %rsi
906	add	%rcx, %rsi
907	add	%rcx, %rdi
908	jmp	L(less48bytes)
909
910	.p2align 4
911L(shr_10_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
912	sub	$32, %rcx
913	movdqa	16(%rsi), %xmm0
914	palignr	$10, (%rsi), %xmm0
915	pcmpeqb	(%rdi), %xmm0
916
917	movdqa	32(%rsi), %xmm3
918	palignr	$10, 16(%rsi), %xmm3
919	pcmpeqb	16(%rdi), %xmm3
920
921L(shr_10_gobble_loop):
922	pand	%xmm0, %xmm3
923	sub	$32, %rcx
924	pmovmskb %xmm3, %edx
925	movdqa	%xmm0, %xmm1
926
927	movdqa	64(%rsi), %xmm3
928	palignr	$10, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
929	sbb	$0xffff, %edx
930	movdqa	48(%rsi), %xmm0
931	palignr	$10, 32(%rsi), %xmm0
932	pcmpeqb	32(%rdi), %xmm0
933	lea	32(%rsi), %rsi
934	pcmpeqb	48(%rdi), %xmm3
935
936	lea	32(%rdi), %rdi
937	jz	L(shr_10_gobble_loop)
938	pand	%xmm0, %xmm3
939
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
940	cmp	$0, %rcx
941	jge	L(shr_10_gobble_next)
942	inc	%edx
943	add	$32, %rcx
944L(shr_10_gobble_next):
945	test	%edx, %edx
946	jnz	L(exit)
947
	/* Now check the block preloaded by the last loop iteration.  */
948	pmovmskb %xmm3, %edx
949	movdqa	%xmm0, %xmm1
950	lea	32(%rdi), %rdi
951	lea	32(%rsi), %rsi
952	sub	$0xffff, %edx
953	jnz	L(exit)
954
	/* Everything matched: restore the 10-byte offset and finish in the
	   tail code.  */
955	lea	10(%rsi), %rsi
956	add	%rcx, %rsi
957	add	%rcx, %rdi
958	jmp	L(less48bytes)
959
/* L(shr_11): after alignment the source data sits 11 bytes above the
   16-byte-aligned %rsi.  palignr $11 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
960	.p2align 4
961L(shr_11):
962	cmp	$80, %rcx
963	lea	-48(%rcx), %rcx
964	mov	%edx, %eax
965	jae	L(shr_11_gobble)
966
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
967	movdqa	16(%rsi), %xmm1
968	movdqa	%xmm1, %xmm2
969	palignr	$11, (%rsi), %xmm1
970	pcmpeqb	(%rdi), %xmm1
971
972	movdqa	32(%rsi), %xmm3
973	palignr	$11, %xmm2, %xmm3
974	pcmpeqb	16(%rdi), %xmm3
975
976	pand	%xmm1, %xmm3
977	pmovmskb %xmm3, %edx
978	lea	32(%rdi), %rdi
979	lea	32(%rsi), %rsi
980	sub	$0xffff, %edx
981	jnz	L(exit)
	/* Block equal: restore the 11-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
982	add	$11, %rsi
983	add	%rcx, %rsi
984	add	%rcx, %rdi
985	jmp	L(less48bytes)
986
987	.p2align 4
988L(shr_11_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
989	sub	$32, %rcx
990	movdqa	16(%rsi), %xmm0
991	palignr	$11, (%rsi), %xmm0
992	pcmpeqb	(%rdi), %xmm0
993
994	movdqa	32(%rsi), %xmm3
995	palignr	$11, 16(%rsi), %xmm3
996	pcmpeqb	16(%rdi), %xmm3
997
998L(shr_11_gobble_loop):
999	pand	%xmm0, %xmm3
1000	sub	$32, %rcx
1001	pmovmskb %xmm3, %edx
1002	movdqa	%xmm0, %xmm1
1003
1004	movdqa	64(%rsi), %xmm3
1005	palignr	$11, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
1006	sbb	$0xffff, %edx
1007	movdqa	48(%rsi), %xmm0
1008	palignr	$11, 32(%rsi), %xmm0
1009	pcmpeqb	32(%rdi), %xmm0
1010	lea	32(%rsi), %rsi
1011	pcmpeqb	48(%rdi), %xmm3
1012
1013	lea	32(%rdi), %rdi
1014	jz	L(shr_11_gobble_loop)
1015	pand	%xmm0, %xmm3
1016
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
1017	cmp	$0, %rcx
1018	jge	L(shr_11_gobble_next)
1019	inc	%edx
1020	add	$32, %rcx
1021L(shr_11_gobble_next):
1022	test	%edx, %edx
1023	jnz	L(exit)
1024
	/* Now check the block preloaded by the last loop iteration.  */
1025	pmovmskb %xmm3, %edx
1026	movdqa	%xmm0, %xmm1
1027	lea	32(%rdi), %rdi
1028	lea	32(%rsi), %rsi
1029	sub	$0xffff, %edx
1030	jnz	L(exit)
1031
	/* Everything matched: restore the 11-byte offset and finish in the
	   tail code.  */
1032	lea	11(%rsi), %rsi
1033	add	%rcx, %rsi
1034	add	%rcx, %rdi
1035	jmp	L(less48bytes)
1036
1037# endif
1038
/* L(shr_12): after alignment the source data sits 12 bytes above the
   16-byte-aligned %rsi.  palignr $12 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  This routine is
   assembled for both memcmp and wmemcmp.  */
1039	.p2align 4
1040L(shr_12):
1041	cmp	$80, %rcx
1042	lea	-48(%rcx), %rcx
1043	mov	%edx, %eax
1044	jae	L(shr_12_gobble)
1045
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
1046	movdqa	16(%rsi), %xmm1
1047	movdqa	%xmm1, %xmm2
1048	palignr	$12, (%rsi), %xmm1
1049	pcmpeqb	(%rdi), %xmm1
1050
1051	movdqa	32(%rsi), %xmm3
1052	palignr	$12, %xmm2, %xmm3
1053	pcmpeqb	16(%rdi), %xmm3
1054
1055	pand	%xmm1, %xmm3
1056	pmovmskb %xmm3, %edx
1057	lea	32(%rdi), %rdi
1058	lea	32(%rsi), %rsi
1059	sub	$0xffff, %edx
1060	jnz	L(exit)
	/* Block equal: restore the 12-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
1061	add	$12, %rsi
1062	add	%rcx, %rsi
1063	add	%rcx, %rdi
1064	jmp	L(less48bytes)
1065
1066	.p2align 4
1067L(shr_12_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
1068	sub	$32, %rcx
1069	movdqa	16(%rsi), %xmm0
1070	palignr	$12, (%rsi), %xmm0
1071	pcmpeqb	(%rdi), %xmm0
1072
1073	movdqa	32(%rsi), %xmm3
1074	palignr	$12, 16(%rsi), %xmm3
1075	pcmpeqb	16(%rdi), %xmm3
1076
1077L(shr_12_gobble_loop):
1078	pand	%xmm0, %xmm3
1079	sub	$32, %rcx
1080	pmovmskb %xmm3, %edx
1081	movdqa	%xmm0, %xmm1
1082
1083	movdqa	64(%rsi), %xmm3
1084	palignr	$12, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
1085	sbb	$0xffff, %edx
1086	movdqa	48(%rsi), %xmm0
1087	palignr	$12, 32(%rsi), %xmm0
1088	pcmpeqb	32(%rdi), %xmm0
1089	lea	32(%rsi), %rsi
1090	pcmpeqb	48(%rdi), %xmm3
1091
1092	lea	32(%rdi), %rdi
1093	jz	L(shr_12_gobble_loop)
1094	pand	%xmm0, %xmm3
1095
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
1096	cmp	$0, %rcx
1097	jge	L(shr_12_gobble_next)
1098	inc	%edx
1099	add	$32, %rcx
1100L(shr_12_gobble_next):
1101	test	%edx, %edx
1102	jnz	L(exit)
1103
	/* Now check the block preloaded by the last loop iteration.  */
1104	pmovmskb %xmm3, %edx
1105	movdqa	%xmm0, %xmm1
1106	lea	32(%rdi), %rdi
1107	lea	32(%rsi), %rsi
1108	sub	$0xffff, %edx
1109	jnz	L(exit)
1110
	/* Everything matched: restore the 12-byte offset and finish in the
	   tail code.  */
1111	lea	12(%rsi), %rsi
1112	add	%rcx, %rsi
1113	add	%rcx, %rdi
1114	jmp	L(less48bytes)
1115
1116# ifndef USE_AS_WMEMCMP
1117
/* L(shr_13): after alignment the source data sits 13 bytes above the
   16-byte-aligned %rsi.  palignr $13 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
1118	.p2align 4
1119L(shr_13):
1120	cmp	$80, %rcx
1121	lea	-48(%rcx), %rcx
1122	mov	%edx, %eax
1123	jae	L(shr_13_gobble)
1124
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
1125	movdqa	16(%rsi), %xmm1
1126	movdqa	%xmm1, %xmm2
1127	palignr	$13, (%rsi), %xmm1
1128	pcmpeqb	(%rdi), %xmm1
1129
1130	movdqa	32(%rsi), %xmm3
1131	palignr	$13, %xmm2, %xmm3
1132	pcmpeqb	16(%rdi), %xmm3
1133
1134	pand	%xmm1, %xmm3
1135	pmovmskb %xmm3, %edx
1136	lea	32(%rdi), %rdi
1137	lea	32(%rsi), %rsi
1138	sub	$0xffff, %edx
1139	jnz	L(exit)
	/* Block equal: restore the 13-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
1140	add	$13, %rsi
1141	add	%rcx, %rsi
1142	add	%rcx, %rdi
1143	jmp	L(less48bytes)
1144
1145	.p2align 4
1146L(shr_13_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
1147	sub	$32, %rcx
1148	movdqa	16(%rsi), %xmm0
1149	palignr	$13, (%rsi), %xmm0
1150	pcmpeqb	(%rdi), %xmm0
1151
1152	movdqa	32(%rsi), %xmm3
1153	palignr	$13, 16(%rsi), %xmm3
1154	pcmpeqb	16(%rdi), %xmm3
1155
1156L(shr_13_gobble_loop):
1157	pand	%xmm0, %xmm3
1158	sub	$32, %rcx
1159	pmovmskb %xmm3, %edx
1160	movdqa	%xmm0, %xmm1
1161
1162	movdqa	64(%rsi), %xmm3
1163	palignr	$13, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
1164	sbb	$0xffff, %edx
1165	movdqa	48(%rsi), %xmm0
1166	palignr	$13, 32(%rsi), %xmm0
1167	pcmpeqb	32(%rdi), %xmm0
1168	lea	32(%rsi), %rsi
1169	pcmpeqb	48(%rdi), %xmm3
1170
1171	lea	32(%rdi), %rdi
1172	jz	L(shr_13_gobble_loop)
1173	pand	%xmm0, %xmm3
1174
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
1175	cmp	$0, %rcx
1176	jge	L(shr_13_gobble_next)
1177	inc	%edx
1178	add	$32, %rcx
1179L(shr_13_gobble_next):
1180	test	%edx, %edx
1181	jnz	L(exit)
1182
	/* Now check the block preloaded by the last loop iteration.  */
1183	pmovmskb %xmm3, %edx
1184	movdqa	%xmm0, %xmm1
1185	lea	32(%rdi), %rdi
1186	lea	32(%rsi), %rsi
1187	sub	$0xffff, %edx
1188	jnz	L(exit)
1189
	/* Everything matched: restore the 13-byte offset and finish in the
	   tail code.  */
1190	lea	13(%rsi), %rsi
1191	add	%rcx, %rsi
1192	add	%rcx, %rdi
1193	jmp	L(less48bytes)
1194
/* L(shr_14): after alignment the source data sits 14 bytes above the
   16-byte-aligned %rsi.  palignr $14 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
1195	.p2align 4
1196L(shr_14):
1197	cmp	$80, %rcx
1198	lea	-48(%rcx), %rcx
1199	mov	%edx, %eax
1200	jae	L(shr_14_gobble)
1201
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
1202	movdqa	16(%rsi), %xmm1
1203	movdqa	%xmm1, %xmm2
1204	palignr	$14, (%rsi), %xmm1
1205	pcmpeqb	(%rdi), %xmm1
1206
1207	movdqa	32(%rsi), %xmm3
1208	palignr	$14, %xmm2, %xmm3
1209	pcmpeqb	16(%rdi), %xmm3
1210
1211	pand	%xmm1, %xmm3
1212	pmovmskb %xmm3, %edx
1213	lea	32(%rdi), %rdi
1214	lea	32(%rsi), %rsi
1215	sub	$0xffff, %edx
1216	jnz	L(exit)
	/* Block equal: restore the 14-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
1217	add	$14, %rsi
1218	add	%rcx, %rsi
1219	add	%rcx, %rdi
1220	jmp	L(less48bytes)
1221
1222	.p2align 4
1223L(shr_14_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
1224	sub	$32, %rcx
1225	movdqa	16(%rsi), %xmm0
1226	palignr	$14, (%rsi), %xmm0
1227	pcmpeqb	(%rdi), %xmm0
1228
1229	movdqa	32(%rsi), %xmm3
1230	palignr	$14, 16(%rsi), %xmm3
1231	pcmpeqb	16(%rdi), %xmm3
1232
1233L(shr_14_gobble_loop):
1234	pand	%xmm0, %xmm3
1235	sub	$32, %rcx
1236	pmovmskb %xmm3, %edx
1237	movdqa	%xmm0, %xmm1
1238
1239	movdqa	64(%rsi), %xmm3
1240	palignr	$14, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
1241	sbb	$0xffff, %edx
1242	movdqa	48(%rsi), %xmm0
1243	palignr	$14, 32(%rsi), %xmm0
1244	pcmpeqb	32(%rdi), %xmm0
1245	lea	32(%rsi), %rsi
1246	pcmpeqb	48(%rdi), %xmm3
1247
1248	lea	32(%rdi), %rdi
1249	jz	L(shr_14_gobble_loop)
1250	pand	%xmm0, %xmm3
1251
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
1252	cmp	$0, %rcx
1253	jge	L(shr_14_gobble_next)
1254	inc	%edx
1255	add	$32, %rcx
1256L(shr_14_gobble_next):
1257	test	%edx, %edx
1258	jnz	L(exit)
1259
	/* Now check the block preloaded by the last loop iteration.  */
1260	pmovmskb %xmm3, %edx
1261	movdqa	%xmm0, %xmm1
1262	lea	32(%rdi), %rdi
1263	lea	32(%rsi), %rsi
1264	sub	$0xffff, %edx
1265	jnz	L(exit)
1266
	/* Everything matched: restore the 14-byte offset and finish in the
	   tail code.  */
1267	lea	14(%rsi), %rsi
1268	add	%rcx, %rsi
1269	add	%rcx, %rdi
1270	jmp	L(less48bytes)
1271
/* L(shr_15): after alignment the source data sits 15 bytes above the
   16-byte-aligned %rsi.  palignr $15 splices each 16 source bytes out of
   two aligned loads so they can be compared with aligned data at %rdi.
   EAX keeps the offset; L(exit) adds it back to %rsi.  */
1272	.p2align 4
1273L(shr_15):
1274	cmp	$80, %rcx
1275	lea	-48(%rcx), %rcx
1276	mov	%edx, %eax
1277	jae	L(shr_15_gobble)
1278
	/* RCX was below 80 (flags from cmp survive lea/mov): compare one
	   32-byte block, no loop.  */
1279	movdqa	16(%rsi), %xmm1
1280	movdqa	%xmm1, %xmm2
1281	palignr	$15, (%rsi), %xmm1
1282	pcmpeqb	(%rdi), %xmm1
1283
1284	movdqa	32(%rsi), %xmm3
1285	palignr	$15, %xmm2, %xmm3
1286	pcmpeqb	16(%rdi), %xmm3
1287
1288	pand	%xmm1, %xmm3
1289	pmovmskb %xmm3, %edx
1290	lea	32(%rdi), %rdi
1291	lea	32(%rsi), %rsi
1292	sub	$0xffff, %edx
1293	jnz	L(exit)
	/* Block equal: restore the 15-byte offset, move both pointers to the
	   end of the data and compare the last RCX bytes in the tail code.  */
1294	add	$15, %rsi
1295	add	%rcx, %rsi
1296	add	%rcx, %rdi
1297	jmp	L(less48bytes)
1298
1299	.p2align 4
1300L(shr_15_gobble):
	/* Long input: 32 bytes per iteration.  Masks for the first block are
	   built here; the loop builds each following block one step ahead.  */
1301	sub	$32, %rcx
1302	movdqa	16(%rsi), %xmm0
1303	palignr	$15, (%rsi), %xmm0
1304	pcmpeqb	(%rdi), %xmm0
1305
1306	movdqa	32(%rsi), %xmm3
1307	palignr	$15, 16(%rsi), %xmm3
1308	pcmpeqb	16(%rdi), %xmm3
1309
1310L(shr_15_gobble_loop):
1311	pand	%xmm0, %xmm3
1312	sub	$32, %rcx
1313	pmovmskb %xmm3, %edx
1314	movdqa	%xmm0, %xmm1
1315
1316	movdqa	64(%rsi), %xmm3
1317	palignr	$15, 48(%rsi), %xmm3
	/* Uses CF from the sub above (the SSE ops leave flags alone):
	   EDX is zero only if all 32 bytes matched and RCX did not borrow.
	   The jz at the loop end tests this result.  */
1318	sbb	$0xffff, %edx
1319	movdqa	48(%rsi), %xmm0
1320	palignr	$15, 32(%rsi), %xmm0
1321	pcmpeqb	32(%rdi), %xmm0
1322	lea	32(%rsi), %rsi
1323	pcmpeqb	48(%rdi), %xmm3
1324
1325	lea	32(%rdi), %rdi
1326	jz	L(shr_15_gobble_loop)
1327	pand	%xmm0, %xmm3
1328
	/* If RCX went negative the loop ended on the borrow: take the borrow
	   back out of EDX and return the 32 bytes to RCX.  */
1329	cmp	$0, %rcx
1330	jge	L(shr_15_gobble_next)
1331	inc	%edx
1332	add	$32, %rcx
1333L(shr_15_gobble_next):
1334	test	%edx, %edx
1335	jnz	L(exit)
1336
	/* Now check the block preloaded by the last loop iteration.  */
1337	pmovmskb %xmm3, %edx
1338	movdqa	%xmm0, %xmm1
1339	lea	32(%rdi), %rdi
1340	lea	32(%rsi), %rsi
1341	sub	$0xffff, %edx
1342	jnz	L(exit)
1343
	/* Everything matched: restore the 15-byte offset and finish in the
	   tail code.  */
1344	lea	15(%rsi), %rsi
1345	add	%rcx, %rsi
1346	add	%rcx, %rdi
1347	jmp	L(less48bytes)
1348# endif
/* L(exit): a 32-byte block contained a mismatch.  On entry the pointers
   are already 32 bytes past the start of that block, xmm1 holds the
   equality mask of its first 16 bytes, EDX is the combined mask minus
   0xffff and EAX is the source offset that was removed from %rsi.
   Falls through into L(less16bytes) with the pointers 16 bytes past the
   start of the 16-byte half that differs.  */
1349	.p2align 4
1350L(exit):
1351	pmovmskb %xmm1, %r8d
1352	sub	$0xffff, %r8d
	/* R8D zero: the first half matched, so the mismatch is in the second
	   half and EDX already describes it.  */
1353	jz	L(first16bytes)
	/* Otherwise step back to the first half and use its mask.  */
1354	lea	-16(%rsi), %rsi
1355	lea	-16(%rdi), %rdi
1356	mov	%r8d, %edx
1357L(first16bytes):
	/* Put the source offset back so %rsi addresses the real bytes.  */
1358	add	%rax, %rsi
1359L(less16bytes):
/* EDX = (16-bit equality mask) - 0xffff, non-zero, and %rdi/%rsi point 16
   bytes past the start of the chunk that holds the mismatch.  That value
   is the negated set of mismatch bits, and negation keeps the lowest set
   bit in place, so the lowest set bit of EDX marks the first byte that
   differs.  */
1360# ifndef USE_AS_WMEMCMP
	/* DL clear: the first 8 bytes match, look at bytes 8..15.  */
1361	test	%dl, %dl
1362	jz	L(next_24_bytes)
1363
1364	test	$0x01, %dl
1365	jnz	L(Byte16)
1366
1367	test	$0x02, %dl
1368	jnz	L(Byte17)
1369
1370	test	$0x04, %dl
1371	jnz	L(Byte18)
1372
1373	test	$0x08, %dl
1374	jnz	L(Byte19)
1375
1376	test	$0x10, %dl
1377	jnz	L(Byte20)
1378
1379	test	$0x20, %dl
1380	jnz	L(Byte21)
1381
1382	test	$0x40, %dl
1383	jnz	L(Byte22)
1384
	/* Only bit 7 is left: byte 7 of the chunk, at offset -9.  */
1385	movzbl	-9(%rdi), %eax
1386	movzbl	-9(%rsi), %edx
1387	sub	%edx, %eax
1388	ret
1389
	/* L(Byte16) .. L(Byte22): return the difference of the two bytes at
	   offsets -16 .. -10.  Both are zero-extended, which gives the
	   unsigned byte comparison memcmp needs.  */
1390	.p2align 4
1391L(Byte16):
1392	movzbl	-16(%rdi), %eax
1393	movzbl	-16(%rsi), %edx
1394	sub	%edx, %eax
1395	ret
1396
1397	.p2align 4
1398L(Byte17):
1399	movzbl	-15(%rdi), %eax
1400	movzbl	-15(%rsi), %edx
1401	sub	%edx, %eax
1402	ret
1403
1404	.p2align 4
1405L(Byte18):
1406	movzbl	-14(%rdi), %eax
1407	movzbl	-14(%rsi), %edx
1408	sub	%edx, %eax
1409	ret
1410
1411	.p2align 4
1412L(Byte19):
1413	movzbl	-13(%rdi), %eax
1414	movzbl	-13(%rsi), %edx
1415	sub	%edx, %eax
1416	ret
1417
1418	.p2align 4
1419L(Byte20):
1420	movzbl	-12(%rdi), %eax
1421	movzbl	-12(%rsi), %edx
1422	sub	%edx, %eax
1423	ret
1424
1425	.p2align 4
1426L(Byte21):
1427	movzbl	-11(%rdi), %eax
1428	movzbl	-11(%rsi), %edx
1429	sub	%edx, %eax
1430	ret
1431
1432	.p2align 4
1433L(Byte22):
1434	movzbl	-10(%rdi), %eax
1435	movzbl	-10(%rsi), %edx
1436	sub	%edx, %eax
1437	ret
1438
1439	.p2align 4
1440L(next_24_bytes):
	/* The mismatch is in bytes 8..15: move both pointers forward by 8 so
	   the L(ByteNN) exits can be reused, and test the mask bits in DH.  */
1441	lea	8(%rdi), %rdi
1442	lea	8(%rsi), %rsi
1443	test	$0x01, %dh
1444	jnz	L(Byte16)
1445
1446	test	$0x02, %dh
1447	jnz	L(Byte17)
1448
1449	test	$0x04, %dh
1450	jnz	L(Byte18)
1451
1452	test	$0x08, %dh
1453	jnz	L(Byte19)
1454
1455	test	$0x10, %dh
1456	jnz	L(Byte20)
1457
1458	test	$0x20, %dh
1459	jnz	L(Byte21)
1460
1461	test	$0x40, %dh
1462	jnz	L(Byte22)
1463
1464	movzbl	-9(%rdi), %eax
1465	movzbl	-9(%rsi), %edx
1466	sub	%edx, %eax
1467	ret
1468# else
1469/* special for wmemcmp */
	/* One mask nibble per 32-bit element.  Find the first element whose
	   nibble has a bit set, compare that element and go to L(find_diff)
	   (defined outside this part of the file) when the values differ.  */
1470	xor	%eax, %eax
1471	test	%dl, %dl
1472	jz	L(next_two_double_words)
1473	and	$15, %dl
1474	jz	L(second_double_word)
1475	mov	-16(%rdi), %eax
1476	cmp	-16(%rsi), %eax
1477	jne	L(find_diff)
1478	ret
1479
1480	.p2align 4
1481L(second_double_word):
1482	mov	-12(%rdi), %eax
1483	cmp	-12(%rsi), %eax
1484	jne	L(find_diff)
1485	ret
1486
1487	.p2align 4
1488L(next_two_double_words):
	/* DL was clear: the mismatch is in the third or fourth element.  */
1489	and	$15, %dh
1490	jz	L(fourth_double_word)
1491	mov	-8(%rdi), %eax
1492	cmp	-8(%rsi), %eax
1493	jne	L(find_diff)
1494	ret
1495
1496	.p2align 4
1497L(fourth_double_word):
1498	mov	-4(%rdi), %eax
1499	cmp	-4(%rsi), %eax
1500	jne	L(find_diff)
1501	ret
1502# endif
1503
/* L(less48bytes): tail dispatch.  ECX is the number of bytes still to be
   compared and %rdi/%rsi point just past the end of the data.  Control
   goes to the L(<n>bytes) handler for that length; lengths of 8 and more
   are sorted out by the L(more*bytes) stages.  */
1504	.p2align 4
1505L(less48bytes):
1506	cmp	$8, %ecx
1507	jae	L(more8bytes)
1508	cmp	$0, %ecx
1509	je	L(0bytes)
1510# ifndef USE_AS_WMEMCMP
1511	cmp	$1, %ecx
1512	je	L(1bytes)
1513	cmp	$2, %ecx
1514	je	L(2bytes)
1515	cmp	$3, %ecx
1516	je	L(3bytes)
1517	cmp	$4, %ecx
1518	je	L(4bytes)
1519	cmp	$5, %ecx
1520	je	L(5bytes)
1521	cmp	$6, %ecx
1522	je	L(6bytes)
1523	jmp	L(7bytes)
1524# else
	/* wmemcmp: every non-zero length below 8 is handled as 4 bytes
	   (the entry code scales the count by 4).  */
1525	jmp	L(4bytes)
1526# endif
1527
/* L(more8bytes): ECX is at least 8.  Lengths 8..15 are dispatched here,
   16 and more go on to L(more16bytes).  */
1528	.p2align 4
1529L(more8bytes):
1530	cmp	$16, %ecx
1531	jae	L(more16bytes)
1532	cmp	$8, %ecx
1533	je	L(8bytes)
1534# ifndef USE_AS_WMEMCMP
1535	cmp	$9, %ecx
1536	je	L(9bytes)
1537	cmp	$10, %ecx
1538	je	L(10bytes)
1539	cmp	$11, %ecx
1540	je	L(11bytes)
1541	cmp	$12, %ecx
1542	je	L(12bytes)
1543	cmp	$13, %ecx
1544	je	L(13bytes)
1545	cmp	$14, %ecx
1546	je	L(14bytes)
1547	jmp	L(15bytes)
1548# else
	/* wmemcmp: anything other than 8 in this range is handled as 12.  */
1549	jmp	L(12bytes)
1550# endif
1551
/* L(more16bytes): ECX is at least 16.  Lengths 16..23 are dispatched
   here, 24 and more go on to L(more24bytes).  */
1552	.p2align 4
1553L(more16bytes):
1554	cmp	$24, %ecx
1555	jae	L(more24bytes)
1556	cmp	$16, %ecx
1557	je	L(16bytes)
1558# ifndef USE_AS_WMEMCMP
1559	cmp	$17, %ecx
1560	je	L(17bytes)
1561	cmp	$18, %ecx
1562	je	L(18bytes)
1563	cmp	$19, %ecx
1564	je	L(19bytes)
1565	cmp	$20, %ecx
1566	je	L(20bytes)
1567	cmp	$21, %ecx
1568	je	L(21bytes)
1569	cmp	$22, %ecx
1570	je	L(22bytes)
1571	jmp	L(23bytes)
1572# else
	/* wmemcmp: anything other than 16 in this range is handled as 20.  */
1573	jmp	L(20bytes)
1574# endif
1575
/* L(more24bytes): ECX is at least 24.  Lengths 24..31 are dispatched
   here, 32 and more go on to L(more32bytes).  */
1576	.p2align 4
1577L(more24bytes):
1578	cmp	$32, %ecx
1579	jae	L(more32bytes)
1580	cmp	$24, %ecx
1581	je	L(24bytes)
1582# ifndef USE_AS_WMEMCMP
1583	cmp	$25, %ecx
1584	je	L(25bytes)
1585	cmp	$26, %ecx
1586	je	L(26bytes)
1587	cmp	$27, %ecx
1588	je	L(27bytes)
1589	cmp	$28, %ecx
1590	je	L(28bytes)
1591	cmp	$29, %ecx
1592	je	L(29bytes)
1593	cmp	$30, %ecx
1594	je	L(30bytes)
1595	jmp	L(31bytes)
1596# else
	/* wmemcmp: anything other than 24 in this range is handled as 28.  */
1597	jmp	L(28bytes)
1598# endif
1599
1600	.p2align 4
/* Byte count in %ecx is at least 32.  Counts of 40 and above are passed
   on; 32..39 are matched one value at a time.  */
L(more32bytes):
	cmpl	$40, %ecx
	jae	L(more40bytes)
	cmpl	$32, %ecx
	je	L(32bytes)
# ifndef USE_AS_WMEMCMP
	cmpl	$33, %ecx
	je	L(33bytes)
	cmpl	$34, %ecx
	je	L(34bytes)
	cmpl	$35, %ecx
	je	L(35bytes)
	cmpl	$36, %ecx
	je	L(36bytes)
	cmpl	$37, %ecx
	je	L(37bytes)
	cmpl	$38, %ecx
	je	L(38bytes)
	jmp	L(39bytes)		/* 39 is the only value left */
# else
	/* wmemcmp counts are multiples of 4: not 32 means 36.  */
	jmp	L(36bytes)
# endif
1623
1624	.p2align 4
/* Last dispatch rung: byte count in %ecx is at least 40 (and below 48
   on the path through L(less48bytes)).  In the wmemcmp build there is
   no jump after the check for 40: execution falls through into the
   L(44bytes) ladder that follows.  The preprocessor conditional opened
   below stays open past this rung and is closed after that ladder.  */
L(more40bytes):
	cmpl	$40, %ecx
	je	L(40bytes)
# ifndef USE_AS_WMEMCMP
	cmpl	$41, %ecx
	je	L(41bytes)
	cmpl	$42, %ecx
	je	L(42bytes)
	cmpl	$43, %ecx
	je	L(43bytes)
	cmpl	$44, %ecx
	je	L(44bytes)
	cmpl	$45, %ecx
	je	L(45bytes)
	cmpl	$46, %ecx
	je	L(46bytes)
	jmp	L(47bytes)		/* 47 is the only value left */
1642
1643	.p2align 4
/* memcmp build: tail ladder for byte counts that are multiples of 4.
   Entering at L(Nbytes) compares the N bytes that end at %rdi/%rsi,
   one dword per rung, falling through from rung to rung.  The dword of
   the second buffer is kept in %ecx because L(find_diff) inspects both
   %eax and %ecx to find the first differing byte.  */
L(44bytes):
	movl	-44(%rsi), %ecx
	movl	-44(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(40bytes):
	movl	-40(%rsi), %ecx
	movl	-40(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(36bytes):
	movl	-36(%rsi), %ecx
	movl	-36(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(32bytes):
	movl	-32(%rsi), %ecx
	movl	-32(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(28bytes):
	movl	-28(%rsi), %ecx
	movl	-28(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(24bytes):
	movl	-24(%rsi), %ecx
	movl	-24(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(20bytes):
	movl	-20(%rsi), %ecx
	movl	-20(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(16bytes):
	movl	-16(%rsi), %ecx
	movl	-16(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(12bytes):
	movl	-12(%rsi), %ecx
	movl	-12(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(8bytes):
	movl	-8(%rsi), %ecx
	movl	-8(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(4bytes):
	movl	-4(%rsi), %ecx
	movl	-4(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(0bytes):
	xorl	%eax, %eax		/* every rung matched: return 0 */
	ret
1702# else
1703	.p2align 4
/* wmemcmp build: tail ladder, one 4-byte element per rung, falling
   through from rung to rung.  Entering at L(Nbytes) compares the N
   bytes that end at %rdi/%rsi.  The second operand is compared straight
   from memory: the wmemcmp L(find_diff) only consumes the flags of the
   compare, so no copy of the element is needed in a register.  */
L(44bytes):
	movl	-44(%rdi), %eax
	cmpl	-44(%rsi), %eax
	jne	L(find_diff)
L(40bytes):
	movl	-40(%rdi), %eax
	cmpl	-40(%rsi), %eax
	jne	L(find_diff)
L(36bytes):
	movl	-36(%rdi), %eax
	cmpl	-36(%rsi), %eax
	jne	L(find_diff)
L(32bytes):
	movl	-32(%rdi), %eax
	cmpl	-32(%rsi), %eax
	jne	L(find_diff)
L(28bytes):
	movl	-28(%rdi), %eax
	cmpl	-28(%rsi), %eax
	jne	L(find_diff)
L(24bytes):
	movl	-24(%rdi), %eax
	cmpl	-24(%rsi), %eax
	jne	L(find_diff)
L(20bytes):
	movl	-20(%rdi), %eax
	cmpl	-20(%rsi), %eax
	jne	L(find_diff)
L(16bytes):
	movl	-16(%rdi), %eax
	cmpl	-16(%rsi), %eax
	jne	L(find_diff)
L(12bytes):
	movl	-12(%rdi), %eax
	cmpl	-12(%rsi), %eax
	jne	L(find_diff)
L(8bytes):
	movl	-8(%rdi), %eax
	cmpl	-8(%rsi), %eax
	jne	L(find_diff)
L(4bytes):
	movl	-4(%rdi), %eax
	cmpl	-4(%rsi), %eax
	jne	L(find_diff)
L(0bytes):
	xorl	%eax, %eax		/* every element matched: return 0 */
	ret
1751# endif
1752
1753# ifndef USE_AS_WMEMCMP
1754	.p2align 4
/* memcmp tail ladder for byte counts of the form 4k+1.  Each rung
   compares one dword and falls through to the next; the final byte is
   handled on its own in L(1bytes).  %ecx carries the dword of the
   second buffer for L(find_diff).  */
L(45bytes):
	movl	-45(%rsi), %ecx
	movl	-45(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(41bytes):
	movl	-41(%rsi), %ecx
	movl	-41(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(37bytes):
	movl	-37(%rsi), %ecx
	movl	-37(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(33bytes):
	movl	-33(%rsi), %ecx
	movl	-33(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(29bytes):
	movl	-29(%rsi), %ecx
	movl	-29(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(25bytes):
	movl	-25(%rsi), %ecx
	movl	-25(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(21bytes):
	movl	-21(%rsi), %ecx
	movl	-21(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(17bytes):
	movl	-17(%rsi), %ecx
	movl	-17(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(13bytes):
	movl	-13(%rsi), %ecx
	movl	-13(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(9bytes):
	movl	-9(%rsi), %ecx
	movl	-9(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(5bytes):
	movl	-5(%rsi), %ecx
	movl	-5(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(1bytes):
	/* Single trailing byte: the carry of this unsigned byte compare is
	   exactly what L(set) converts into -1/+1.  */
	movzbl	-1(%rdi), %eax
	cmpb	-1(%rsi), %al
	jne	L(set)
	xorl	%eax, %eax		/* buffers are equal */
	ret
1816
1817	.p2align 4
/* memcmp tail ladder for byte counts of the form 4k+2.  Each rung
   compares one dword and falls through to the next; the final two
   bytes are handled in L(2bytes).  %ecx carries the dword of the
   second buffer for L(find_diff).  */
L(46bytes):
	movl	-46(%rsi), %ecx
	movl	-46(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(42bytes):
	movl	-42(%rsi), %ecx
	movl	-42(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(38bytes):
	movl	-38(%rsi), %ecx
	movl	-38(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(34bytes):
	movl	-34(%rsi), %ecx
	movl	-34(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(30bytes):
	movl	-30(%rsi), %ecx
	movl	-30(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(26bytes):
	movl	-26(%rsi), %ecx
	movl	-26(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(22bytes):
	movl	-22(%rsi), %ecx
	movl	-22(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(18bytes):
	movl	-18(%rsi), %ecx
	movl	-18(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(14bytes):
	movl	-14(%rsi), %ecx
	movl	-14(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(10bytes):
	movl	-10(%rsi), %ecx
	movl	-10(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(6bytes):
	movl	-6(%rsi), %ecx
	movl	-6(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(2bytes):
	/* Last two bytes, loaded as zero-extended words.  The low byte is
	   the one at the lower address, so it is checked first; once it is
	   known to match, the full compare is decided by the high byte.  */
	movzwl	-2(%rsi), %ecx
	movzwl	-2(%rdi), %eax
	cmpb	%cl, %al
	jne	L(set)
	cmpl	%ecx, %eax
	jne	L(set)
	xorl	%eax, %eax		/* buffers are equal */
	ret
1882
1883	.p2align 4
/* memcmp tail ladder for byte counts of the form 4k+3.  Each rung
   compares one dword and falls through to the next; the final three
   bytes are handled in L(3bytes).  %ecx carries the dword of the
   second buffer for L(find_diff).  */
L(47bytes):
	movl	-47(%rsi), %ecx
	movl	-47(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(43bytes):
	movl	-43(%rsi), %ecx
	movl	-43(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(39bytes):
	movl	-39(%rsi), %ecx
	movl	-39(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(35bytes):
	movl	-35(%rsi), %ecx
	movl	-35(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(31bytes):
	movl	-31(%rsi), %ecx
	movl	-31(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(27bytes):
	movl	-27(%rsi), %ecx
	movl	-27(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(23bytes):
	movl	-23(%rsi), %ecx
	movl	-23(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(19bytes):
	movl	-19(%rsi), %ecx
	movl	-19(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(15bytes):
	movl	-15(%rsi), %ecx
	movl	-15(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(11bytes):
	movl	-11(%rsi), %ecx
	movl	-11(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(7bytes):
	movl	-7(%rsi), %ecx
	movl	-7(%rdi), %eax
	cmpl	%ecx, %eax
	jne	L(find_diff)
L(3bytes):
	/* Last three bytes: a zero-extended word (low byte first, then the
	   whole word, which is decided by the high byte once the low bytes
	   match) followed by the final single byte.  */
	movzwl	-3(%rsi), %ecx
	movzwl	-3(%rdi), %eax
	cmpb	%cl, %al
	jne	L(set)
	cmpl	%ecx, %eax
	jne	L(set)
	movzbl	-1(%rdi), %eax
	cmpb	-1(%rsi), %al
	jne	L(set)
	xorl	%eax, %eax		/* buffers are equal */
	ret
1951
1952	.p2align 4
/* memcmp build.  %eax and %ecx hold two dwords that are known to
   differ (first and second buffer respectively).  The byte at the
   lowest address sits in the lowest register byte, so the bytes are
   examined from low to high and the first unequal pair decides the
   result through an unsigned compare.  */
L(find_diff):
	cmpb	%cl, %al		/* byte 0 */
	jne	L(set)
	cmpw	%cx, %ax		/* byte 0 matches, so byte 1 decides */
	jne	L(set)
	shrl	$16, %ecx		/* bring bytes 2 and 3 down */
	shrl	$16, %eax
	cmpb	%cl, %al		/* byte 2 */
	jne	L(set)

/* We get here only if we already know there is a difference, so it
   has to be in byte 3; no branch is needed after this compare.  */

	cmpl	%ecx, %eax
/* Turn the carry of the deciding unsigned compare into the return
   value: carry set (first operand below) gives -1, carry clear gives
   +1.  */
L(set):
	sbbl	%eax, %eax		/* %eax = -CF; CF is left unchanged */
	sbbl	$-1, %eax		/* %eax = %eax + 1 - CF */
	ret
1971# else
1972
1973/* for wmemcmp */
1974	.p2align 4
/* wmemcmp build.  Entered by jne right after a dword compare of the
   two differing elements, with the flags of that compare still live.
   Returns +1 when the element of the first buffer is greater in the
   signed sense and -1 otherwise.  */
L(find_diff):
	movl	$1, %eax		/* mov does not touch the flags */
	jg	L(find_diff_bigger)
	negl	%eax			/* not greater and not equal: -1 */
L(find_diff_bigger):
	ret
1984# endif
1985
1986	.p2align 4
/* Common "no difference" exit: return 0.  Used, for example, by the
   wmemcmp entry when the element count is zero.  */
L(equal):
	xorl	%eax, %eax
	ret
1990
1991END (MEMCMP)
1992#endif
1993