1/* Optimized wcscmp for x86-64 with SSE2.
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */
22
23	.text
/* Entry point.  %rdi and %rsi hold the two wide-string pointers (first
   and second argument in the SysV AMD64 convention).  The result is
   returned in %eax: 0 when the strings are equal, otherwise 1 or -1
   according to a signed comparison of the first differing wide chars.

   Set-up done here:
     %xmm0      all zero, the reference value for terminator checks
     %cl / %ch  low byte of %rdi / %rsi (masked with 15 to test for
                16-byte alignment)
     %edx / %eax  offset of %rdi / %rsi inside a 64-byte line
   The code then branches on the position of %rdi: a 16-byte aligned
   pointer goes to L(continue_00); otherwise offsets below 16, 32 and 48
   go to L(continue_0), L(continue_16) and L(continue_32), and anything
   else falls through to the L(continue_48) code that follows.  */
24ENTRY (__wcscmp)
25/*
26	* This implementation uses SSE to compare up to 16 bytes at a time.
27*/
28	mov	%esi, %eax
29	mov	%edi, %edx
30	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
31	mov	%al, %ch
32	mov	%dl, %cl
33	and	$63, %eax		/* rsi alignment in cache line */
34	and	$63, %edx		/* rdi alignment in cache line */
35	and	$15, %cl
36	jz	L(continue_00)
37	cmp	$16, %edx
38	jb	L(continue_0)
39	cmp	$32, %edx
40	jb	L(continue_16)
41	cmp	$48, %edx
42	jb	L(continue_32)
43
/* %rdi is not 16-byte aligned and its offset in the 64-byte line is 48
   or more.  Classify %rsi the same way: %ch (low byte of %rsi) masked
   with 15 detects 16-byte alignment, %eax is its offset in the line.
   Offsets of 48 and above fall through to L(continue_48_48).  */
44L(continue_48):
45	and	$15, %ch
46	jz	L(continue_48_00)
47	cmp	$16, %eax
48	jb	L(continue_0_48)
49	cmp	$32, %eax
50	jb	L(continue_16_48)
51	cmp	$48, %eax
52	jb	L(continue_32_48)
53
54	.p2align 4
/* Main 64-byte loop for two unaligned pointers that both sit at line
   offset 48 or more.  The first four wide chars are compared one at a
   time with a terminator test after each (presumably so that no 16-byte
   load straddles the end of the current 64-byte line).  Offsets 16, 32
   and 48 are then compared with unaligned 16-byte loads.

   Vector step: %xmm0 (zero on entry) becomes the terminator mask of the
   %rdi data, %xmm1 the equality mask; after psubb and pmovmskb, %edx has
   a bit set for every byte that is equal and not part of a terminator.
   %edx is 0xffff only when all four wide chars match and none is zero,
   in which case %xmm0 is still zero for the next step.  */
55L(continue_48_48):
56	mov	(%rsi), %ecx
57	cmp	%ecx, (%rdi)
58	jne	L(nequal)
59	test	%ecx, %ecx
60	jz	L(equal)
61
62	mov	4(%rsi), %ecx
63	cmp	%ecx, 4(%rdi)
64	jne	L(nequal)
65	test	%ecx, %ecx
66	jz	L(equal)
67
68	mov	8(%rsi), %ecx
69	cmp	%ecx, 8(%rdi)
70	jne	L(nequal)
71	test	%ecx, %ecx
72	jz	L(equal)
73
74	mov	12(%rsi), %ecx
75	cmp	%ecx, 12(%rdi)
76	jne	L(nequal)
77	test	%ecx, %ecx
78	jz	L(equal)
79
80	movdqu	16(%rdi), %xmm1
81	movdqu	16(%rsi), %xmm2
82	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
83	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
84	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
85	pmovmskb %xmm1, %edx
86	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
87	jnz	L(less4_double_words_16)
88
89	movdqu	32(%rdi), %xmm1
90	movdqu	32(%rsi), %xmm2
91	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
92	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
93	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
94	pmovmskb %xmm1, %edx
95	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
96	jnz	L(less4_double_words_32)
97
98	movdqu	48(%rdi), %xmm1
99	movdqu	48(%rsi), %xmm2
100	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
101	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
102	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
103	pmovmskb %xmm1, %edx
104	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
105	jnz	L(less4_double_words_48)
106
107	add	$64, %rsi
108	add	$64, %rdi
109	jmp	L(continue_48_48)
110
/* %rdi is not 16-byte aligned and its offset in the 64-byte line is
   below 16.  Classify %rsi: 16-byte aligned goes to L(continue_0_00);
   otherwise its line offset %eax selects the 0, 16 or 32 variant, and
   48 or more falls through to L(continue_0_48).  */
111L(continue_0):
112	and	$15, %ch
113	jz	L(continue_0_00)
114	cmp	$16, %eax
115	jb	L(continue_0_0)
116	cmp	$32, %eax
117	jb	L(continue_0_16)
118	cmp	$48, %eax
119	jb	L(continue_0_32)
120
121	.p2align 4
/* Main 64-byte loop when one pointer has line offset below 16 and the
   other 48 or more, in either order (reached from both L(continue_0)
   and L(continue_48)); neither is 16-byte aligned.  Offsets 0 and 48
   are compared one wide char at a time with a terminator test after
   each (presumably because a 16-byte load there could straddle the end
   of a 64-byte line for one of the pointers); offsets 16 and 32 use
   unaligned 16-byte loads.

   Vector step: %xmm0 (zero on entry) becomes the terminator mask,
   %xmm1 the equality mask; %edx ends up 0xffff before the sub only if
   all four wide chars are equal and none is zero.  */
122L(continue_0_48):
123	mov	(%rsi), %ecx
124	cmp	%ecx, (%rdi)
125	jne	L(nequal)
126	test	%ecx, %ecx
127	jz	L(equal)
128
129	mov	4(%rsi), %ecx
130	cmp	%ecx, 4(%rdi)
131	jne	L(nequal)
132	test	%ecx, %ecx
133	jz	L(equal)
134
135	mov	8(%rsi), %ecx
136	cmp	%ecx, 8(%rdi)
137	jne	L(nequal)
138	test	%ecx, %ecx
139	jz	L(equal)
140
141	mov	12(%rsi), %ecx
142	cmp	%ecx, 12(%rdi)
143	jne	L(nequal)
144	test	%ecx, %ecx
145	jz	L(equal)
146
147	movdqu	16(%rdi), %xmm1
148	movdqu	16(%rsi), %xmm2
149	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
150	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
151	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
152	pmovmskb %xmm1, %edx
153	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
154	jnz	L(less4_double_words_16)
155
156	movdqu	32(%rdi), %xmm1
157	movdqu	32(%rsi), %xmm2
158	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
159	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
160	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
161	pmovmskb %xmm1, %edx
162	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
163	jnz	L(less4_double_words_32)
164
165	mov	48(%rsi), %ecx
166	cmp	%ecx, 48(%rdi)
167	jne	L(nequal)
168	test	%ecx, %ecx
169	jz	L(equal)
170
171	mov	52(%rsi), %ecx
172	cmp	%ecx, 52(%rdi)
173	jne	L(nequal)
174	test	%ecx, %ecx
175	jz	L(equal)
176
177	mov	56(%rsi), %ecx
178	cmp	%ecx, 56(%rdi)
179	jne	L(nequal)
180	test	%ecx, %ecx
181	jz	L(equal)
182
183	mov	60(%rsi), %ecx
184	cmp	%ecx, 60(%rdi)
185	jne	L(nequal)
186	test	%ecx, %ecx
187	jz	L(equal)
188
189	add	$64, %rsi
190	add	$64, %rdi
191	jmp	L(continue_0_48)
192
193	.p2align 4
/* %rdi is 16-byte aligned.  Classify %rsi: also 16-byte aligned goes to
   L(continue_00_00); otherwise its line offset %eax selects the 0, 16
   or 32 variant, and 48 or more falls through to L(continue_00_48).  */
194L(continue_00):
195	and	$15, %ch
196	jz	L(continue_00_00)
197	cmp	$16, %eax
198	jb	L(continue_00_0)
199	cmp	$32, %eax
200	jb	L(continue_00_16)
201	cmp	$48, %eax
202	jb	L(continue_00_32)
203
204	.p2align 4
/* Main 64-byte loop: %rdi is 16-byte aligned, %rsi is unaligned with
   line offset 48 or more.

   Offset 0: the aligned pcmpeqd on (%rdi) looks for a terminator among
   the first four wide chars of the %rdi string.  If one is present the
   scalar path L(less4_double_words1) finishes the comparison (%eax
   already holds the first wide char).  Otherwise the four wide chars
   are compared one by one with no terminator tests, and %xmm0 is still
   zero.

   Offsets 16, 32, 48: unaligned load from %rsi into %xmm2, terminator
   mask of that data into %xmm0, equality against the aligned %rdi
   memory operand; %edx is 0xffff before the sub only if all four wide
   chars are equal and none is zero.  */
205L(continue_00_48):
206	pcmpeqd	(%rdi), %xmm0
207	mov	(%rdi), %eax
208	pmovmskb %xmm0, %ecx
209	test	%ecx, %ecx
210	jnz	L(less4_double_words1)
211
212	cmp	(%rsi), %eax
213	jne	L(nequal)
214
215	mov	4(%rdi), %eax
216	cmp	4(%rsi), %eax
217	jne	L(nequal)
218
219	mov	8(%rdi), %eax
220	cmp	8(%rsi), %eax
221	jne	L(nequal)
222
223	mov	12(%rdi), %eax
224	cmp	12(%rsi), %eax
225	jne	L(nequal)
226
227	movdqu	16(%rsi), %xmm2
228	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
229	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
230	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
231	pmovmskb %xmm2, %edx
232	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
233	jnz	L(less4_double_words_16)
234
235	movdqu	32(%rsi), %xmm2
236	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
237	pcmpeqd	32(%rdi), %xmm2		/* compare first 4 double_words for equality */
238	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
239	pmovmskb %xmm2, %edx
240	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
241	jnz	L(less4_double_words_32)
242
243	movdqu	48(%rsi), %xmm2
244	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
245	pcmpeqd	48(%rdi), %xmm2		/* compare first 4 double_words for equality */
246	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
247	pmovmskb %xmm2, %edx
248	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
249	jnz	L(less4_double_words_48)
250
251	add	$64, %rsi
252	add	$64, %rdi
253	jmp	L(continue_00_48)
254
255	.p2align 4
/* %rdi is not 16-byte aligned and its offset in the 64-byte line is in
   the range 32..47.  Classify %rsi: 16-byte aligned goes to
   L(continue_32_00); otherwise its line offset %eax selects the 0, 16
   or 32 variant, and 48 or more falls through to L(continue_32_48).  */
256L(continue_32):
257	and	$15, %ch
258	jz	L(continue_32_00)
259	cmp	$16, %eax
260	jb	L(continue_0_32)
261	cmp	$32, %eax
262	jb	L(continue_16_32)
263	cmp	$48, %eax
264	jb	L(continue_32_32)
265
266	.p2align 4
/* Main 64-byte loop when one pointer has line offset 32..47 and the
   other 48 or more, in either order; neither is 16-byte aligned.  The
   first eight wide chars (offsets 0..28) are compared one at a time
   with a terminator test after each (presumably because a 16-byte load
   at offset 0 or 16 could straddle the end of a 64-byte line for one of
   the pointers); offsets 32 and 48 use unaligned 16-byte loads.

   Vector step: %xmm0 (zero on entry) becomes the terminator mask,
   %xmm1 the equality mask; %edx is 0xffff before the sub only if all
   four wide chars are equal and none is zero.  */
267L(continue_32_48):
268	mov	(%rsi), %ecx
269	cmp	%ecx, (%rdi)
270	jne	L(nequal)
271	test	%ecx, %ecx
272	jz	L(equal)
273
274	mov	4(%rsi), %ecx
275	cmp	%ecx, 4(%rdi)
276	jne	L(nequal)
277	test	%ecx, %ecx
278	jz	L(equal)
279
280	mov	8(%rsi), %ecx
281	cmp	%ecx, 8(%rdi)
282	jne	L(nequal)
283	test	%ecx, %ecx
284	jz	L(equal)
285
286	mov	12(%rsi), %ecx
287	cmp	%ecx, 12(%rdi)
288	jne	L(nequal)
289	test	%ecx, %ecx
290	jz	L(equal)
291
292	mov	16(%rsi), %ecx
293	cmp	%ecx, 16(%rdi)
294	jne	L(nequal)
295	test	%ecx, %ecx
296	jz	L(equal)
297
298	mov	20(%rsi), %ecx
299	cmp	%ecx, 20(%rdi)
300	jne	L(nequal)
301	test	%ecx, %ecx
302	jz	L(equal)
303
304	mov	24(%rsi), %ecx
305	cmp	%ecx, 24(%rdi)
306	jne	L(nequal)
307	test	%ecx, %ecx
308	jz	L(equal)
309
310	mov	28(%rsi), %ecx
311	cmp	%ecx, 28(%rdi)
312	jne	L(nequal)
313	test	%ecx, %ecx
314	jz	L(equal)
315
316	movdqu	32(%rdi), %xmm1
317	movdqu	32(%rsi), %xmm2
318	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
319	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
320	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
321	pmovmskb %xmm1, %edx
322	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
323	jnz	L(less4_double_words_32)
324
325	movdqu	48(%rdi), %xmm1
326	movdqu	48(%rsi), %xmm2
327	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
328	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
329	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
330	pmovmskb %xmm1, %edx
331	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
332	jnz	L(less4_double_words_48)
333
334	add	$64, %rsi
335	add	$64, %rdi
336	jmp	L(continue_32_48)
337
338	.p2align 4
/* %rdi is not 16-byte aligned and its offset in the 64-byte line is in
   the range 16..31.  Classify %rsi: 16-byte aligned goes to
   L(continue_16_00); otherwise its line offset %eax selects the 0, 16
   or 32 variant, and 48 or more falls through to L(continue_16_48).  */
339L(continue_16):
340	and	$15, %ch
341	jz	L(continue_16_00)
342	cmp	$16, %eax
343	jb	L(continue_0_16)
344	cmp	$32, %eax
345	jb	L(continue_16_16)
346	cmp	$48, %eax
347	jb	L(continue_16_32)
348
349	.p2align 4
/* Main 64-byte loop when one pointer has line offset 16..31 and the
   other 48 or more, in either order; neither is 16-byte aligned.
   Offsets 0 and 32 are compared one wide char at a time with a
   terminator test after each (presumably because a 16-byte load there
   could straddle the end of a 64-byte line for one of the pointers);
   offsets 16 and 48 use unaligned 16-byte loads.

   Vector step: %xmm0 (zero on entry) becomes the terminator mask,
   %xmm1 the equality mask; %edx is 0xffff before the sub only if all
   four wide chars are equal and none is zero.  */
350L(continue_16_48):
351	mov	(%rsi), %ecx
352	cmp	%ecx, (%rdi)
353	jne	L(nequal)
354	test	%ecx, %ecx
355	jz	L(equal)
356
357	mov	4(%rsi), %ecx
358	cmp	%ecx, 4(%rdi)
359	jne	L(nequal)
360	test	%ecx, %ecx
361	jz	L(equal)
362
363	mov	8(%rsi), %ecx
364	cmp	%ecx, 8(%rdi)
365	jne	L(nequal)
366	test	%ecx, %ecx
367	jz	L(equal)
368
369	mov	12(%rsi), %ecx
370	cmp	%ecx, 12(%rdi)
371	jne	L(nequal)
372	test	%ecx, %ecx
373	jz	L(equal)
374
375	movdqu	16(%rdi), %xmm1
376	movdqu	16(%rsi), %xmm2
377	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
378	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
379	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
380	pmovmskb %xmm1, %edx
381	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
382	jnz	L(less4_double_words_16)
383
384	mov	32(%rsi), %ecx
385	cmp	%ecx, 32(%rdi)
386	jne	L(nequal)
387	test	%ecx, %ecx
388	jz	L(equal)
389
390	mov	36(%rsi), %ecx
391	cmp	%ecx, 36(%rdi)
392	jne	L(nequal)
393	test	%ecx, %ecx
394	jz	L(equal)
395
396	mov	40(%rsi), %ecx
397	cmp	%ecx, 40(%rdi)
398	jne	L(nequal)
399	test	%ecx, %ecx
400	jz	L(equal)
401
402	mov	44(%rsi), %ecx
403	cmp	%ecx, 44(%rdi)
404	jne	L(nequal)
405	test	%ecx, %ecx
406	jz	L(equal)
407
408	movdqu	48(%rdi), %xmm1
409	movdqu	48(%rsi), %xmm2
410	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
411	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
412	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
413	pmovmskb %xmm1, %edx
414	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
415	jnz	L(less4_double_words_48)
416
417	add	$64, %rsi
418	add	$64, %rdi
419	jmp	L(continue_16_48)
420
421	.p2align 4
/* Main 64-byte loop when both pointers are 16-byte aligned: four fully
   vectorised steps using aligned loads (movdqa and aligned memory
   operands).  In each step %xmm0 (zero on entry) becomes the terminator
   mask of the %rdi data and the data register becomes the equality
   mask; %edx is 0xffff before the sub only if all four wide chars are
   equal and none is zero, otherwise the matching L(less4_double_words*)
   tail decides the result.  */
422L(continue_00_00):
423	movdqa	(%rdi), %xmm1
424	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
425	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
426	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
427	pmovmskb %xmm1, %edx
428	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
429	jnz	L(less4_double_words)
430
431	movdqa	16(%rdi), %xmm3
432	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
433	pcmpeqd	16(%rsi), %xmm3		/* compare first 4 double_words for equality */
434	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
435	pmovmskb %xmm3, %edx
436	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
437	jnz	L(less4_double_words_16)
438
439	movdqa	32(%rdi), %xmm5
440	pcmpeqd	%xmm5, %xmm0		/* Any null double_word? */
441	pcmpeqd	32(%rsi), %xmm5		/* compare first 4 double_words for equality */
442	psubb	%xmm0, %xmm5		/* packed sub of comparison results*/
443	pmovmskb %xmm5, %edx
444	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
445	jnz	L(less4_double_words_32)
446
447	movdqa	48(%rdi), %xmm1
448	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
449	pcmpeqd	48(%rsi), %xmm1		/* compare first 4 double_words for equality */
450	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
451	pmovmskb %xmm1, %edx
452	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
453	jnz	L(less4_double_words_48)
454
455	add	$64, %rsi
456	add	$64, %rdi
457	jmp	L(continue_00_00)
458
459	.p2align 4
/* Prologue: %rdi is 16-byte aligned, %rsi is unaligned with line offset
   32..47.  One vector step (unaligned load from %rsi, aligned operand
   from %rdi) handles the first 16 bytes; advancing both pointers by 16
   moves %rsi to line offset 48 or more, which is the state the
   L(continue_00_48) loop expects.  */
460L(continue_00_32):
461	movdqu	(%rsi), %xmm2
462	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
463	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
464	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
465	pmovmskb %xmm2, %edx
466	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
467	jnz	L(less4_double_words)
468
469	add	$16, %rsi
470	add	$16, %rdi
471	jmp	L(continue_00_48)
472
473	.p2align 4
/* Prologue: %rdi is 16-byte aligned, %rsi is unaligned with line offset
   16..31.  Two vector steps (unaligned load from %rsi, aligned operand
   from %rdi) handle the first 32 bytes; advancing both pointers by 32
   moves %rsi to line offset 48 or more, which is the state the
   L(continue_00_48) loop expects.  */
474L(continue_00_16):
475	movdqu	(%rsi), %xmm2
476	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
477	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
478	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
479	pmovmskb %xmm2, %edx
480	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
481	jnz	L(less4_double_words)
482
483	movdqu	16(%rsi), %xmm2
484	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
485	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
486	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
487	pmovmskb %xmm2, %edx
488	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
489	jnz	L(less4_double_words_16)
490
491	add	$32, %rsi
492	add	$32, %rdi
493	jmp	L(continue_00_48)
494
495	.p2align 4
/* Prologue: %rdi is 16-byte aligned, %rsi is unaligned with line offset
   below 16.  Three vector steps (unaligned load from %rsi, aligned
   operand from %rdi) handle the first 48 bytes; advancing both pointers
   by 48 moves %rsi to line offset 48 or more, which is the state the
   L(continue_00_48) loop expects.  */
496L(continue_00_0):
497	movdqu	(%rsi), %xmm2
498	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
499	pcmpeqd	(%rdi), %xmm2		/* compare first 4 double_words for equality */
500	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
501	pmovmskb %xmm2, %edx
502	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
503	jnz	L(less4_double_words)
504
505	movdqu	16(%rsi), %xmm2
506	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
507	pcmpeqd	16(%rdi), %xmm2		/* compare first 4 double_words for equality */
508	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
509	pmovmskb %xmm2, %edx
510	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
511	jnz	L(less4_double_words_16)
512
513	movdqu	32(%rsi), %xmm2
514	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
515	pcmpeqd	32(%rdi), %xmm2		/* compare first 4 double_words for equality */
516	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
517	pmovmskb %xmm2, %edx
518	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
519	jnz	L(less4_double_words_32)
520
521	add	$48, %rsi
522	add	$48, %rdi
523	jmp	L(continue_00_48)
524
525	.p2align 4
/* Main 64-byte loop: %rsi is 16-byte aligned, %rdi is unaligned with
   line offset 48 or more.

   Offset 0: the aligned pcmpeqd on (%rsi) looks for a terminator among
   the first four wide chars of the %rsi string.  If one is present the
   scalar path L(less4_double_words1) finishes the comparison (%eax
   already holds the wide char at (%rdi)).  Otherwise the four wide
   chars are compared one by one with no terminator tests, and %xmm0 is
   still zero.

   Offsets 16, 32, 48: unaligned load from %rdi into %xmm1, terminator
   mask of that data into %xmm0, equality against the aligned %rsi
   memory operand; %edx is 0xffff before the sub only if all four wide
   chars are equal and none is zero.  */
526L(continue_48_00):
527	pcmpeqd	(%rsi), %xmm0
528	mov	(%rdi), %eax
529	pmovmskb %xmm0, %ecx
530	test	%ecx, %ecx
531	jnz	L(less4_double_words1)
532
533	cmp	(%rsi), %eax
534	jne	L(nequal)
535
536	mov	4(%rdi), %eax
537	cmp	4(%rsi), %eax
538	jne	L(nequal)
539
540	mov	8(%rdi), %eax
541	cmp	8(%rsi), %eax
542	jne	L(nequal)
543
544	mov	12(%rdi), %eax
545	cmp	12(%rsi), %eax
546	jne	L(nequal)
547
548	movdqu	16(%rdi), %xmm1
549	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
550	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
551	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
552	pmovmskb %xmm1, %edx
553	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
554	jnz	L(less4_double_words_16)
555
556	movdqu	32(%rdi), %xmm1
557	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
558	pcmpeqd	32(%rsi), %xmm1		/* compare first 4 double_words for equality */
559	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
560	pmovmskb %xmm1, %edx
561	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
562	jnz	L(less4_double_words_32)
563
564	movdqu	48(%rdi), %xmm1
565	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
566	pcmpeqd	48(%rsi), %xmm1		/* compare first 4 double_words for equality */
567	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
568	pmovmskb %xmm1, %edx
569	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
570	jnz	L(less4_double_words_48)
571
572	add	$64, %rsi
573	add	$64, %rdi
574	jmp	L(continue_48_00)
575
576	.p2align 4
/* Prologue: %rsi is 16-byte aligned, %rdi is unaligned with line offset
   32..47.  One vector step (unaligned load from %rdi, aligned operand
   from %rsi) handles the first 16 bytes; advancing both pointers by 16
   moves %rdi to line offset 48 or more, which is the state the
   L(continue_48_00) loop expects.  */
577L(continue_32_00):
578	movdqu	(%rdi), %xmm1
579	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
580	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
581	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
582	pmovmskb %xmm1, %edx
583	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
584	jnz	L(less4_double_words)
585
586	add	$16, %rsi
587	add	$16, %rdi
588	jmp	L(continue_48_00)
589
590	.p2align 4
/* Prologue: %rsi is 16-byte aligned, %rdi is unaligned with line offset
   16..31.  Two vector steps (unaligned load from %rdi, aligned operand
   from %rsi) handle the first 32 bytes; advancing both pointers by 32
   moves %rdi to line offset 48 or more, which is the state the
   L(continue_48_00) loop expects.  */
591L(continue_16_00):
592	movdqu	(%rdi), %xmm1
593	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
594	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
595	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
596	pmovmskb %xmm1, %edx
597	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
598	jnz	L(less4_double_words)
599
600	movdqu	16(%rdi), %xmm1
601	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
602	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
603	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
604	pmovmskb %xmm1, %edx
605	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
606	jnz	L(less4_double_words_16)
607
608	add	$32, %rsi
609	add	$32, %rdi
610	jmp	L(continue_48_00)
611
612	.p2align 4
/* Prologue: %rsi is 16-byte aligned, %rdi is unaligned with line offset
   below 16.  Three vector steps (unaligned load from %rdi, aligned
   operand from %rsi) handle the first 48 bytes; advancing both pointers
   by 48 moves %rdi to line offset 48 or more, which is the state the
   L(continue_48_00) loop expects.  */
613L(continue_0_00):
614	movdqu	(%rdi), %xmm1
615	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
616	pcmpeqd	(%rsi), %xmm1		/* compare first 4 double_words for equality */
617	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
618	pmovmskb %xmm1, %edx
619	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
620	jnz	L(less4_double_words)
621
622	movdqu	16(%rdi), %xmm1
623	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
624	pcmpeqd	16(%rsi), %xmm1		/* compare first 4 double_words for equality */
625	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
626	pmovmskb %xmm1, %edx
627	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
628	jnz	L(less4_double_words_16)
629
630	movdqu	32(%rdi), %xmm1
631	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
632	pcmpeqd	32(%rsi), %xmm1		/* compare first 4 double_words for equality */
633	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
634	pmovmskb %xmm1, %edx
635	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
636	jnz	L(less4_double_words_32)
637
638	add	$48, %rsi
639	add	$48, %rdi
640	jmp	L(continue_48_00)
641
642	.p2align 4
/* Prologue: both pointers are unaligned with line offset 32..47.  One
   vector step with unaligned loads handles the first 16 bytes;
   advancing both pointers by 16 puts both at line offset 48 or more,
   which is the state the L(continue_48_48) loop expects.  */
643L(continue_32_32):
644	movdqu	(%rdi), %xmm1
645	movdqu	(%rsi), %xmm2
646	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
647	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
648	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
649	pmovmskb %xmm1, %edx
650	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
651	jnz	L(less4_double_words)
652
653	add	$16, %rsi
654	add	$16, %rdi
655	jmp	L(continue_48_48)
656
657	.p2align 4
/* Prologue: both pointers are unaligned with line offset 16..31.  Two
   vector steps with unaligned loads handle the first 32 bytes;
   advancing both pointers by 32 puts both at line offset 48 or more,
   which is the state the L(continue_48_48) loop expects.  */
658L(continue_16_16):
659	movdqu	(%rdi), %xmm1
660	movdqu	(%rsi), %xmm2
661	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
662	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
663	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
664	pmovmskb %xmm1, %edx
665	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
666	jnz	L(less4_double_words)
667
668	movdqu	16(%rdi), %xmm3
669	movdqu	16(%rsi), %xmm4
670	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
671	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
672	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
673	pmovmskb %xmm3, %edx
674	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
675	jnz	L(less4_double_words_16)
676
677	add	$32, %rsi
678	add	$32, %rdi
679	jmp	L(continue_48_48)
680
681	.p2align 4
/* Prologue: both pointers are unaligned with line offset below 16.
   Three vector steps with unaligned loads handle the first 48 bytes;
   advancing both pointers by 48 puts both at line offset 48 or more,
   which is the state the L(continue_48_48) loop expects.  */
682L(continue_0_0):
683	movdqu	(%rdi), %xmm1
684	movdqu	(%rsi), %xmm2
685	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
686	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
687	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
688	pmovmskb %xmm1, %edx
689	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
690	jnz	L(less4_double_words)
691
692	movdqu	16(%rdi), %xmm3
693	movdqu	16(%rsi), %xmm4
694	pcmpeqd	%xmm3, %xmm0		/* Any null double_word? */
695	pcmpeqd	%xmm4, %xmm3		/* compare first 4 double_words for equality */
696	psubb	%xmm0, %xmm3		/* packed sub of comparison results*/
697	pmovmskb %xmm3, %edx
698	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
699	jnz	L(less4_double_words_16)
700
701	movdqu	32(%rdi), %xmm1
702	movdqu	32(%rsi), %xmm2
703	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
704	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
705	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
706	pmovmskb %xmm1, %edx
707	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
708	jnz	L(less4_double_words_32)
709
710	add	$48, %rsi
711	add	$48, %rdi
712	jmp	L(continue_48_48)
713
714	.p2align 4
/* Prologue: one pointer has line offset below 16 and the other 16..31,
   in either order (reached from both L(continue_0) and L(continue_16));
   neither is 16-byte aligned.  Two vector steps with unaligned loads
   handle the first 32 bytes; advancing both pointers by 32 gives line
   offsets 32..47 and 48 or more, which is the state the
   L(continue_32_48) loop expects.  */
715L(continue_0_16):
716	movdqu	(%rdi), %xmm1
717	movdqu	(%rsi), %xmm2
718	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
719	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
720	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
721	pmovmskb %xmm1, %edx
722	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
723	jnz	L(less4_double_words)
724
725	movdqu	16(%rdi), %xmm1
726	movdqu	16(%rsi), %xmm2
727	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
728	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
729	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
730	pmovmskb %xmm1, %edx
731	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
732	jnz	L(less4_double_words_16)
733
734	add	$32, %rsi
735	add	$32, %rdi
736	jmp	L(continue_32_48)
737
738	.p2align 4
/* Prologue: one pointer has line offset below 16 and the other 32..47,
   in either order (reached from both L(continue_0) and L(continue_32));
   neither is 16-byte aligned.  One vector step with unaligned loads
   handles the first 16 bytes; advancing both pointers by 16 gives line
   offsets 16..31 and 48 or more, which is the state the
   L(continue_16_48) loop expects.  */
739L(continue_0_32):
740	movdqu	(%rdi), %xmm1
741	movdqu	(%rsi), %xmm2
742	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
743	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
744	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
745	pmovmskb %xmm1, %edx
746	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
747	jnz	L(less4_double_words)
748
749	add	$16, %rsi
750	add	$16, %rdi
751	jmp	L(continue_16_48)
752
753	.p2align 4
/* Prologue: one pointer has line offset 16..31 and the other 32..47, in
   either order (reached from both L(continue_16) and L(continue_32));
   neither is 16-byte aligned.  One vector step with unaligned loads
   handles the first 16 bytes; advancing both pointers by 16 gives line
   offsets 32..47 and 48 or more, which is the state the
   L(continue_32_48) loop expects.  */
754L(continue_16_32):
755	movdqu	(%rdi), %xmm1
756	movdqu	(%rsi), %xmm2
757	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
758	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
759	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
760	pmovmskb %xmm1, %edx
761	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
762	jnz	L(less4_double_words)
763
764	add	$16, %rsi
765	add	$16, %rdi
766	jmp	L(continue_32_48)
767
768	.p2align 4
/* Scalar finish used by L(continue_00_48) and L(continue_48_00) once a
   terminator has been detected among the four wide chars at offsets
   0..12 of the aligned string.  On entry %eax holds the wide char at
   (%rdi).  Each position is compared and, when equal, tested for zero.
   The fourth position needs no zero test: if the first three pairs were
   equal and nonzero, the terminator is in the fourth slot, so equality
   there means the strings are equal.  */
769L(less4_double_words1):
770	cmp	(%rsi), %eax
771	jne	L(nequal)
772	test	%eax, %eax
773	jz	L(equal)
774
775	mov	4(%rsi), %ecx
776	cmp	%ecx, 4(%rdi)
777	jne	L(nequal)
778	test	%ecx, %ecx
779	jz	L(equal)
780
781	mov	8(%rsi), %ecx
782	cmp	%ecx, 8(%rdi)
783	jne	L(nequal)
784	test	%ecx, %ecx
785	jz	L(equal)
786
787	mov	12(%rsi), %ecx
788	cmp	%ecx, 12(%rdi)
789	jne	L(nequal)
790	xor	%eax, %eax
791	ret
792
793	.p2align 4
/* Tail for a vector step at offset 0 that found a mismatch or a
   terminator.  On entry %edx = mask - 0xffff (nonzero), where mask has
   one bit per byte that was equal and not part of a terminator.  That
   value is the negation of the set of failing bytes, so its lowest set
   bit is the lowest failing byte:
     %dl nonzero, low nibble nonzero   first wide char  (offset 0)
     %dl nonzero, low nibble zero      second wide char (offset 4)
     %dl zero, low nibble of %dh set   third wide char  (offset 8)
     otherwise                         fourth wide char (offset 12)
   The selected pair is compared with the %rdi value in %eax.  If the
   two differ L(nequal) produces the result; if they are equal the wide
   char must be the terminator, so %eax is already 0 at the ret.  */
794L(less4_double_words):
795	xor	%eax, %eax
796	test	%dl, %dl
797	jz	L(next_two_double_words)
798	and	$15, %dl
799	jz	L(second_double_word)
800	mov	(%rdi), %eax
801	cmp	(%rsi), %eax
802	jne	L(nequal)
803	ret
804
805	.p2align 4
806L(second_double_word):
807	mov	4(%rdi), %eax
808	cmp	4(%rsi), %eax
809	jne	L(nequal)
810	ret
811
812	.p2align 4
813L(next_two_double_words):
814	and	$15, %dh
815	jz	L(fourth_double_word)
816	mov	8(%rdi), %eax
817	cmp	8(%rsi), %eax
818	jne	L(nequal)
819	ret
820
821	.p2align 4
822L(fourth_double_word):
823	mov	12(%rdi), %eax
824	cmp	12(%rsi), %eax
825	jne	L(nequal)
826	ret
827
828	.p2align 4
/* Tail for a vector step at offset 16 that found a mismatch or a
   terminator.  On entry %edx = mask - 0xffff (nonzero), where mask has
   one bit per byte that was equal and not part of a terminator; the
   lowest set bit of %edx is the lowest failing byte:
     %dl nonzero, low nibble nonzero   wide char at offset 16
     %dl nonzero, low nibble zero      wide char at offset 20
     %dl zero, low nibble of %dh set   wide char at offset 24
     otherwise                         wide char at offset 28
   The selected pair is compared with the %rdi value in %eax.  If the
   two differ L(nequal) produces the result; if they are equal the wide
   char must be the terminator, so %eax is already 0 at the ret.  */
829L(less4_double_words_16):
830	xor	%eax, %eax
831	test	%dl, %dl
832	jz	L(next_two_double_words_16)
833	and	$15, %dl
834	jz	L(second_double_word_16)
835	mov	16(%rdi), %eax
836	cmp	16(%rsi), %eax
837	jne	L(nequal)
838	ret
839
840	.p2align 4
841L(second_double_word_16):
842	mov	20(%rdi), %eax
843	cmp	20(%rsi), %eax
844	jne	L(nequal)
845	ret
846
847	.p2align 4
848L(next_two_double_words_16):
849	and	$15, %dh
850	jz	L(fourth_double_word_16)
851	mov	24(%rdi), %eax
852	cmp	24(%rsi), %eax
853	jne	L(nequal)
854	ret
855
856	.p2align 4
857L(fourth_double_word_16):
858	mov	28(%rdi), %eax
859	cmp	28(%rsi), %eax
860	jne	L(nequal)
861	ret
862
863	.p2align 4
/* Tail for a vector step at offset 32 that found a mismatch or a
   terminator.  On entry %edx = mask - 0xffff (nonzero), where mask has
   one bit per byte that was equal and not part of a terminator; the
   lowest set bit of %edx is the lowest failing byte:
     %dl nonzero, low nibble nonzero   wide char at offset 32
     %dl nonzero, low nibble zero      wide char at offset 36
     %dl zero, low nibble of %dh set   wide char at offset 40
     otherwise                         wide char at offset 44
   The selected pair is compared with the %rdi value in %eax.  If the
   two differ L(nequal) produces the result; if they are equal the wide
   char must be the terminator, so %eax is already 0 at the ret.  */
864L(less4_double_words_32):
865	xor	%eax, %eax
866	test	%dl, %dl
867	jz	L(next_two_double_words_32)
868	and	$15, %dl
869	jz	L(second_double_word_32)
870	mov	32(%rdi), %eax
871	cmp	32(%rsi), %eax
872	jne	L(nequal)
873	ret
874
875	.p2align 4
876L(second_double_word_32):
877	mov	36(%rdi), %eax
878	cmp	36(%rsi), %eax
879	jne	L(nequal)
880	ret
881
882	.p2align 4
883L(next_two_double_words_32):
884	and	$15, %dh
885	jz	L(fourth_double_word_32)
886	mov	40(%rdi), %eax
887	cmp	40(%rsi), %eax
888	jne	L(nequal)
889	ret
890
891	.p2align 4
892L(fourth_double_word_32):
893	mov	44(%rdi), %eax
894	cmp	44(%rsi), %eax
895	jne	L(nequal)
896	ret
897
898	.p2align 4
/* Tail for a vector step at offset 48 that found a mismatch or a
   terminator.  On entry %edx = mask - 0xffff (nonzero), where mask has
   one bit per byte that was equal and not part of a terminator; the
   lowest set bit of %edx is the lowest failing byte:
     %dl nonzero, low nibble nonzero   wide char at offset 48
     %dl nonzero, low nibble zero      wide char at offset 52
     %dl zero, low nibble of %dh set   wide char at offset 56
     otherwise                         wide char at offset 60
   The selected pair is compared with the %rdi value in %eax.  If the
   two differ L(nequal) produces the result; if they are equal the wide
   char must be the terminator, so %eax is already 0 at the ret.  */
899L(less4_double_words_48):
900	xor	%eax, %eax
901	test	%dl, %dl
902	jz	L(next_two_double_words_48)
903	and	$15, %dl
904	jz	L(second_double_word_48)
905	mov	48(%rdi), %eax
906	cmp	48(%rsi), %eax
907	jne	L(nequal)
908	ret
909
910	.p2align 4
911L(second_double_word_48):
912	mov	52(%rdi), %eax
913	cmp	52(%rsi), %eax
914	jne	L(nequal)
915	ret
916
917	.p2align 4
918L(next_two_double_words_48):
919	and	$15, %dh
920	jz	L(fourth_double_word_48)
921	mov	56(%rdi), %eax
922	cmp	56(%rsi), %eax
923	jne	L(nequal)
924	ret
925
926	.p2align 4
927L(fourth_double_word_48):
928	mov	60(%rdi), %eax
929	cmp	60(%rsi), %eax
930	jne	L(nequal)
931	ret
932
933	.p2align 4
/* Common exit for a mismatch.  Every jump here is a jne placed directly
   after a cmp that computed (wide char from %rdi) minus (wide char from
   %rsi), and the mov below does not change the flags, so jg still
   tests that comparison.  Returns 1 in %eax when the %rdi wide char is
   greater as a signed 32-bit value, otherwise -1.  */
934L(nequal):
935	mov	$1, %eax
936	jg	L(nequal_bigger)
937	neg	%eax
938
939L(nequal_bigger):
940	ret
941
942	.p2align 4
/* Common exit when a terminator was reached with every wide char equal:
   return 0.  The 32-bit xor clears all of %rax (a 32-bit register write
   zeroes the upper half) with a shorter encoding than the 64-bit form,
   and matches the other zeroing sites in this function.  */
943L(equal):
944	xor	%eax, %eax
945	ret
946
947END (__wcscmp)
948#ifndef __wcscmp
949libc_hidden_def (__wcscmp)
950weak_alias (__wcscmp, wcscmp)
951#endif
952