/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine  power8
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN		r3
#define rSTR1		r3	/* First string arg.  */
#define rSTR2		r4	/* Second string arg.  */
#define rN		r5	/* Max string length.  */
#define rWORD1		r6	/* Current word in s1.  */
#define rWORD2		r7	/* Current word in s2.  */
#define rWORD3		r8	/* Next word in s1.  */
#define rWORD4		r9	/* Next word in s2.  */
#define rWORD5		r10	/* Next word in s1.  */
#define rWORD6		r11	/* Next word in s2.  */

#define rOFF8		r20	/* 8 bytes offset.  */
#define rOFF16  	r21	/* 16 bytes offset.  */
#define rOFF24		r22	/* 24 bytes offset.  */
#define rOFF32		r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rWORD7		r30	/* Next word in s1.  */
#define rWORD8		r31	/* Next word in s2.  */

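/* Save slots for the non-volatile registers used below.  These are
   negative offsets from r1 (below the stack pointer), so the function
   does not need to allocate a stack frame.  */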
#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif
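/* On little-endian, ldbrx byte-reverses each load so that doublewords
   compare in memory (big-endian) byte order; on big-endian a plain ldx
   suffices.  Either way cmpld on the loaded values matches memcmp's
   unsigned byte-by-byte ordering.  */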

	xor	r10, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 8
	clrldi.	r0, r10, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
	/* If the length is less than 8 bytes, use the byte compare loop;
	   if the two strings do not share the same DW alignment, use the
	   unaligned path.  */
	blt	cr1, L(bytealigned)
	bne	L(unalignedqw)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.  */

	.align	4
L(samealignment):
	or	r11, rSTR2, rSTR1
	clrldi.	r11, r11, 60
	beq	L(qw_align)
	/* Try to align to QW else proceed to DW loop.  */
	clrldi.	r10, r10, 60
	bne	L(DW)
	/* Both strings share the same offset within a QW: compare the
	   leading bytes as DWs until QW alignment is reached.  */
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	subfic	r10, r12, 8
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	sldi	r9, r10, 3
	subfic	r9, r9, 64
	sld	rWORD1, rWORD1, r9
	sld	rWORD2, rWORD2, r9
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(ret_diff)
	subf	rN, r10, rN

	cmpld	cr6, r11, r12
	bgt	cr6, L(qw_align)
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(different)
	cmpldi	cr6, rN, 8
	ble	cr6, L(zeroLength)
	addi	rN, rN, -8
	/* Now both rSTR1 and rSTR2 are aligned to QW.  */
	.align	4
L(qw_align):
	vspltisb	v0, 0
	srdi.	r6, rN, 6
	li	r8, 16
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64)
	mtctr	r6
	vspltisb	v8, 0
	vspltisb	v6, 0
	/* Aligned vector loop.  */
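	/* The loop below is software pipelined: the compare at the top of
	   each iteration checks the vectors loaded by the previous
	   iteration.  v6 and v8 are zeroed above so the very first
	   top-of-loop compare trivially succeeds.  */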
	.align	4
L(aligned_loop):
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r8
	lvx	v8, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	lvx	v4, rSTR1, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r11
	lvx	v8, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(aligned_loop)
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	clrldi	rN, rN, 58
	/* Handle remainder for aligned loop.  */
	.align	4
L(lessthan64):
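	/* rSTR1 is r3, which doubles as the return value: clear it now so
	   the early exits below return 0 once all remaining bytes have
	   compared equal.  The s1 pointer is kept in r9.  */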
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v5, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	lvx	v5, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

	/* Calculate and return the difference.  */
	.align 4
L(different1):
	cmpdi	cr6, rN, 16
	bge	cr6, L(different2)
	/* Discard unwanted bytes.  */
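	/* The lvsr/lvsl-generated permute keeps only the rN bytes still
	   under comparison, zero-filling the rest of each vector from v0,
	   so bytes beyond the compare length cannot create a false
	   mismatch.  */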
#ifdef __LITTLE_ENDIAN__
	lvsr	v1, 0, rN
	vperm	v4, v4, v0, v1
	vperm	v5, v5, v0, v1
#else
	lvsl	v1, 0, rN
	vperm	v4, v0, v4, v1
	vperm	v5, v0, v5, v1
#endif
	vcmpequb.	v7, v4, v5
	li	rRTN, 0
	bltlr	cr6
	.align 4
L(different2):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	lvsl	v10, r0, r0
	vspltisb	v8, 15
	vsububm	v9, v8, v10
	vperm	v4, v4, v0, v9
	vperm	v5, v5, v0, v9
#endif
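	/* mfvrd extracts the most significant doubleword of each vector,
	   so the high halves are compared first and the low halves only if
	   they match.  */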
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
	vsldoi	v4, v4, v4, 8
	vsldoi	v5, v5, v5, 8
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
L(ret_diff):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(different3):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	vspltisb	v9, 15
	lvsl	v10, r0, r0
	vsububm	v9, v9, v10
	vperm	v6, v6, v0, v9
	vperm	v8, v8, v0, v9
#endif
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
	vsldoi	v6, v6, v6, 8
	vsldoi	v8, v8, v8, 8
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align 4
L(different):
	cmpldi	cr7, rN, 8
	bgt	cr7, L(end)
	/* Skip unwanted bytes.  */
	sldi	r8, rN, 3
	subfic	r8, r8, 64
	srd	rWORD1, rWORD1, r8
	srd	rWORD2, rWORD2, r8
	cmpld	cr6, rWORD1, rWORD2
	li	rRTN, 0
	beqlr	cr6
L(end):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align	4
L(unalignedqw):
	/* Proceed to the DW unaligned loop if there is a chance of a page
	   cross.  */
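	/* r9 = (address & 0xfff) + length: if this exceeds 4096-16 for
	   either string, a 16-byte vector load near the end could touch
	   the next page and fault, so take the DW unaligned path
	   instead.  */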
	rldicl	r9, rSTR1, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
	rldicl	r9, rSTR2, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
	li	r0, 0
	li	r8, 16
	vspltisb	v0, 0
	/* Check if rSTR1 is aligned to QW.  */
	andi.	r11, rSTR1, 0xF
	beq	L(s1_align)

	/* Compare 16B and align S1 to QW.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	lvsr	v6, 0, rSTR2	/* Compute mask.  */
#else
	lvsl	v10, 0, rSTR1	/* Compute mask.  */
	lvsl	v6, 0, rSTR2	/* Compute mask.  */
#endif
	lvx	v5, 0, rSTR2
	lvx	v9, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
	lvx	v4, 0, rSTR1
	lvx	v9, rSTR1, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	cmpldi	cr6, rN, 16
	ble	cr6, L(zeroLength)
	subfic	r11, r11, 16
	subf	rN, r11, rN
	add	rSTR1, rSTR1, r11
	add	rSTR2, rSTR2, r11

	/* As s1 is QW aligned prepare for unaligned loop.  */
	.align	4
L(s1_align):
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
#else
	lvsl	v6, 0, rSTR2
#endif
	lvx	v5, 0, rSTR2
	srdi.	r6, rN, 6
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64_unalign)
	mtctr	r6
	li 	r9, 64
	/* Unaligned vector loop.  */
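	/* v5 always holds the previously loaded (QW-aligned) s2 vector.
	   vperm combines it with the next s2 vector (v10) using the mask
	   in v6 to produce 16 bytes of s2 data at s1's alignment, and
	   `vor v5, v10, v10' carries the newest s2 vector into the next
	   step.  */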
	.align	4
L(unalign_qwloop):
	lvx	v4, 0, rSTR1
	lvx	v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r11
	lvx	v10, rSTR2, r9
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(unalign_qwloop)
	clrldi	rN, rN, 58
	/* Handle remainder for unaligned loop.  */
	.align	4
L(lessthan64_unalign):
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx     v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	addi	r11, r11, 16
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

/* Otherwise we know the two strings have the same alignment (but are
   not yet DW aligned).  So we force the string addresses to the next lower
   DW boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
	.align	4
L(DW):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8.  */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16.  */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24.  */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	srdi	r0, rN, 5	/* Divide by 32.  */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8.  */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16.  */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24.  */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
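/* Each iteration loads the next four doubleword pairs while testing the
   compare results of the four pairs loaded by the previous iteration;
   cr7, cr1, cr6 and cr5 each track one outstanding pair.  */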
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop exits early (before all pending bytes have been
   tested), and we must complete the pending compares before
   returning.  */
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is already double word aligned
   and we can perform the DW unaligned (L(DWunaligned)) loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and
   special case this first DW using shift left to eliminate bits
   preceding the first byte.  Since we want to join the normal
   (DWaligned) compare loop, starting at the second double word, we
   need to adjust the length (rN) and special case the loop versioning
   for the first DW.  This ensures that the loop count is correct and
   the first DW (shifted) is in the expected register pair.  */
L(unaligned):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)
	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8.  */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24.  */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32.  */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8.  */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24.  */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)		/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
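/* Same structure as L(dLoop), with the extra work of reconstructing
   each unaligned rSTR2 doubleword: the bytes carried over from the
   previous load (saved shifted left in the rWORDn_SHIFT registers) are
   ORed with the current load shifted right by rSHR before the
   compare.  */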
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)
