1/* Optimized strcasecmp implementation for PowerPC64.
2   Copyright (C) 2016-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include <locale-defines.h>
21
22/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
23
/* This file is assembled twice: once for strcasecmp, and once with
   USE_AS_STRNCASECMP defined for strncasecmp, which takes the extra
   length limit n in r5.  Select the entry-point names accordingly.  */
#ifndef USE_AS_STRNCASECMP
#  define __STRCASECMP __strcasecmp
#  define STRCASECMP   strcasecmp
#else
#  define __STRCASECMP __strncasecmp
#  define STRCASECMP   strncasecmp
#endif
/* Convert 16 bytes to lowercase and compare.
   Requires v1 = 0xbf, v2 = 0x19, v3 = 0x20 replicated in every byte
   (set up once before the first use).  For each byte c of v4/v5:
   c + 0xbf wraps so that uppercase 'A'..'Z' (0x41..0x5a) land in
   0x00..0x19, hence vcmpgtub against 0x19 produces an all-ones mask
   exactly for the bytes that are NOT uppercase ASCII.  vsel then keeps
   the original byte where that mask is set and substitutes c + 0x20
   (the lowercase form) where it is clear — a branch-free tolower.
   The final vcmpequb. compares the two lowered vectors and records the
   outcome in cr6 (cr6.LT set = all 16 bytes equal), which the callers
   test with blt/beq on cr6.  Clobbers v7, v8.  */
#define TOLOWER()     \
	vaddubm	v8, v4, v1; \
	vaddubm	v7, v4, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v4, v7, v4, v8; \
	vaddubm	v8, v5, v1; \
	vaddubm	v7, v5, v3; \
	vcmpgtub	v8, v8, v2; \
	vsel	v5, v7, v5, v8; \
	vcmpequb.	v7, v5, v4;
42
/*
 * Get 16 bytes for unaligned case.
 * reg1: Vector to hold next 16 bytes.
 * reg2: Address to read from.
 * reg3: Permute control vector.
 * v8: Tmp vector used to mask unwanted bytes.
 * v9: Tmp vector,0 when null is found on first 16 bytes
 *
 * The wanted 16 string bytes straddle the two aligned quadwords at
 * reg2 and reg2+16 (lvx ignores the low 4 address bits).  The second
 * quadword is loaded ONLY if the valid part of the first contains no
 * NUL byte; otherwise v9 = 0 is substituted instead.  This avoids
 * reading past the string's terminator into a potentially unmapped
 * page.  The first vperm pads the bytes that do not belong to the
 * string with 0xff (never equal to NUL) so the vcmpequb. test only
 * sees real string bytes; after vcmpequb., "beq cr6" means no byte
 * compared equal to zero, i.e. no NUL was found.  Clobbers r6, v8, v9.
 */
#ifdef __LITTLE_ENDIAN__
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	v8, -1; \
	vperm	v8, v8, reg1, reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, v9, reg1, reg3;
#else
#define GET16BYTES(reg1, reg2, reg3) \
	lvx	reg1, 0, reg2; \
	vspltisb	 v8, -1; \
	vperm	v8, reg1, v8,  reg3; \
	vcmpequb.	v8, v0, v8; \
	beq	cr6, 1f; \
	vspltisb	v9, 0; \
	b	2f; \
	.align 4; \
1: \
	addi	r6, reg2, 16; \
	lvx	v9, 0, r6; \
2: \
	vperm	reg1, reg1, v9, reg3;
#endif
82
/* Check null in v4, v5 and convert to lower.
   If either vector contains a NUL byte, branch to L(null_found), where
   junk beyond the terminator is stripped before the final compare.
   Note the inverted sense: after "vcmpequb. vX, v0, ...", beq cr6
   means NO byte matched zero, so the fall-through path (NUL present)
   takes the b to L(null_found).  With both vectors NUL-free, TOLOWER
   lowercases and compares them, leaving the result in cr6 for the
   caller (cr6.LT set = all bytes equal).  Clobbers v7, v8.  */
#define CHECKNULLANDCONVERT() \
	vcmpequb.	v7, v0, v5; \
	beq	cr6, 3f; \
	vcmpequb.	v7, v0, v4; \
	beq	cr6, 3f; \
	b	L(null_found); \
	.align  4; \
3: \
	TOLOWER()
93
	.machine  power8

/* Vector path handles ASCII-only case folding; if the current locale
   declares non-ASCII case rules, or a NUL appears within the first 16
   bytes of either string, control falls back to the table-driven byte
   loop at L(bytebybyte), which is correct for any locale.  */
ENTRY (__STRCASECMP)
#ifdef USE_AS_STRNCASECMP
	CALL_MCOUNT 3
#else
	CALL_MCOUNT 2
#endif
#define rRTN	r3	/* Return value */
#define rSTR1	r10	/* 1st string */
#define rSTR2	r4	/* 2nd string */
#define rCHAR1	r6	/* Byte read from 1st string */
#define rCHAR2	r7	/* Byte read from 2nd string */
#define rADDR1	r8	/* Address of tolower(rCHAR1) */
#define rADDR2	r12	/* Address of tolower(rCHAR2) */
#define rLWR1	r8	/* Word tolower(rCHAR1) */
#define rLWR2	r12	/* Word tolower(rCHAR2) */
#define rTMP	r9
#define rLOC	r11	/* Default locale address */

	/* Identical pointers compare equal; result taken at beqlr below,
	   after rRTN has been cleared.  */
	cmpd	cr7, rRTN, rSTR2

	/* Get locale address.  */
	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
	ld	rLOC, 0(rLOC)

	mr	rSTR1, rRTN	/* Free r3 so it can hold the result.  */
	li	rRTN, 0
	beqlr	cr7
#ifdef USE_AS_STRNCASECMP
	/* n == 0 compares equal; n < 16 is not worth the vector setup.  */
	cmpdi	cr7, r5, 0
	beq	cr7, L(retnull)
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)
#endif
	vspltisb	v0, 0
	vspltisb	v8, -1
	/* Check for null in initial characters.
	   Check max of 16 char depending on the alignment.
	   If null is present, proceed byte by byte.  */
	lvx	v4, 0, rSTR1
#ifdef  __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
#else
	lvsl	v10, 0, rSTR1
	vperm	v9, v4, v8, v10
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	lvx	v5, 0, rSTR2
	/* Calculate alignment.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
#else
	lvsl	v6, 0, rSTR2
	vperm	v9, v5, v8, v6
#endif
	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
	bne	cr6, L(bytebybyte)
	/* Check if locale has non ascii characters.  */
	ld	rTMP, 0(rLOC)
	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
	lwz	rTMP, 0(r6)
	cmpdi	cr7, rTMP, 1
	beq	cr7, L(bytebybyte)

	/* Load vector registers with values used for TOLOWER.  */
	/* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte.  */
	vspltisb	v3, 2
	vspltisb	v9, 4
	vsl	v3, v3, v9	/* v3 = 2 << 4 = 0x20.  */
	vaddubm	v1, v3, v3	/* 0x40 ...  */
	vnor	v1, v1, v1	/* ... complemented = 0xbf.  */
	vspltisb	v2, 7
	vsububm	v2, v3, v2	/* v2 = 0x20 - 7 = 0x19.  */

	/* lvx above ignored the low 4 address bits; if s1 was unaligned,
	   merge in the quadword at s1+16 to reconstruct s1[0..15].  This
	   is safe: the NUL checks above proved 16 NUL-free bytes.  */
	andi.	rADDR1, rSTR1, 0xF
	beq	cr0, L(align)
	addi	r6, rSTR1, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
L(align):
	/* Same reconstruction for s2.  */
	andi.	rADDR2, rSTR2, 0xF
	beq	cr0, L(align1)
	addi	r6, rSTR2, 16
	lvx	v9, 0, r6
	/* Compute 16 bytes from previous two loads.  */
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
L(align1):
	CHECKNULLANDCONVERT()
	blt	cr6, L(match)
	b	L(different)
	.align 	4
L(match):
	/* First 16 bytes matched.  Advance both pointers by the distance
	   from s1 to its next 16-byte boundary so s1 accesses are aligned
	   from now on, then back up 16 because both loops below
	   pre-increment by 16.  */
	clrldi	r6, rSTR1, 60
	subfic	r7, r6, 16
#ifdef USE_AS_STRNCASECMP
	sub	r5, r5, r7	/* Account for the bytes just compared.  */
#endif
	add	rSTR1, rSTR1, r7
	add	rSTR2, rSTR2, r7
	andi.	rADDR2, rSTR2, 0xF
	addi	rSTR1, rSTR1, -16
	addi	rSTR2, rSTR2, -16
	beq	cr0, L(aligned)
	/* s2 remains unaligned: build its permute control once; its low
	   4 address bits stay constant across 16-byte steps.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
#else
	lvsl	v6, 0, rSTR2
#endif
	/* There are 2 loops depending on the input alignment.
	   Each loop gets 16 bytes from s1 and s2, check for null,
	   convert to lowercase and compare. Loop till difference
	   or null occurs. */
L(s1_align):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)	/* < 16 bytes left: finish scalar.  */
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	GET16BYTES(v5, rSTR2, v6)
	CHECKNULLANDCONVERT()
	blt	cr6, L(s1_align)
	b	L(different)
	.align 	4
L(aligned):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#ifdef USE_AS_STRNCASECMP
	cmpdi	cr7, r5, 16
	blt	cr7, L(bytebybyte)
	addi	r5, r5, -16
#endif
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	CHECKNULLANDCONVERT()
	blt	cr6, L(aligned)

	/* Calculate and return the difference.
	   On entry v4/v5 hold the lowered vectors and v7 the (not
	   all-ones) byte-equality mask from TOLOWER.  Locate the first
	   differing byte, move that byte pair to a GPR and subtract.  */
L(different):
	vaddubm	v1, v3, v3	/* v1 = 0x40 (= 64), used as limit/shift.  */
	vcmpequb	v7, v0, v7	/* Invert mask: differing bytes -> ones.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero.  */
	vspltisb	v8, -1
	vadduqm	v9, v7, v8	/* v9 = v7 - 1 (128-bit), then ...  */
	vandc	v8, v9, v7	/* ... & ~v7 isolates the trailing zeros.  */
	vpopcntd	v8, v8	/* Per-doubleword trailing-zero counts.  */
	vspltb	v6, v8, 15
	vcmpequb.	v6, v6, v1	/* First doubleword all zero (count 64)?  */
	blt	cr6, L(shift8)		/* Yes: combine both doubleword counts.  */
#else
	/* Count leading zero.  */
	vclzd	v8, v7
	vspltb	v6, v8, 7
	vcmpequb.	v6, v6, v1
	blt	cr6, L(shift8)
	vsro	v8, v8, v1	/* Move the count into the low doubleword.  */
#endif
	b	L(skipsum)
	.align  4
L(shift8):
L(shift8) */
	vsumsws		v8, v8, v0	/* Sum the two doubleword counts.  */
L(skipsum):
#ifdef __LITTLE_ENDIAN__
	/* Shift registers based on leading zero count.  */
	vsro	v6, v5, v8
	vsro	v7, v4, v8
	/* Merge and move to GPR.  */
	vmrglb	v6, v6, v7
	vslo	v1, v6, v1
	mfvrd	r3, v1
	/* Place the characters that are different in first position.  */
	sldi	rSTR2, rRTN, 56
	srdi	rSTR2, rSTR2, 56
	sldi	rSTR1, rRTN, 48
	srdi	rSTR1, rSTR1, 56
#else
	vslo	v6, v5, v8
	vslo	v7, v4, v8
	vmrghb	v1, v6, v7
	mfvrd	r3, v1
	srdi	rSTR2, rRTN, 48
	sldi	rSTR2, rSTR2, 56
	srdi	rSTR2, rSTR2, 56
	srdi	rSTR1, rRTN, 56
#endif
	subf  	rRTN, rSTR1, rSTR2	/* Signed difference of lowered bytes.  */
	extsw 	rRTN, rRTN
	blr

	.align  4
	/* OK. We've hit the end of the string. We need to be careful that
	   we don't compare two strings as different because of junk beyond
	   the end of the strings...  */
L(null_found):
	/* v7 holds the NUL-position mask from CHECKNULLANDCONVERT.  Find
	   the bit index of the first NUL, shift both raw vectors so the
	   terminator and everything beyond it is discarded, then lowercase
	   and compare only the surviving bytes.  */
	vaddubm	v10, v3, v3	/* v10 = 0x40 (= 64) in every byte.  */
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zero.  */
	vspltisb	v8, -1
	vadduqm	v9, v7, v8
	vandc	v8, v9, v7
	vpopcntd	v8, v8
	vspltb	v6, v8, 15
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
#else
	/* Count leading zero.  */
	vclzd	v8, v7
	vspltb	v6, v8, 7
	vcmpequb.	v6, v6, v10
	blt	cr6, L(shift_8)
	vsro	v8, v8, v10
#endif
	b	L(skipsum1)
	.align  4
L(shift_8):
	vsumsws	v8, v8, v0
L(skipsum1):
	/* Calculate shift count based on count of zero.  */
	vspltisb	v10, 7
	vslb	v10, v10, v10	/* v10 = 0x80.  */
	vsldoi	v9, v0, v10, 1	/* Shift-count operand = 128 ...  */
	vsubudm	v9, v9, v8	/* ... minus the NUL's bit index ...  */
	vspltisb	v8, 8
	vsldoi	v8, v0, v8, 1
	vsubudm	v9, v9, v8	/* ... minus 8: bits to shift away.  */
	/* Shift and remove junk after null character.  */
#ifdef __LITTLE_ENDIAN__
	vslo	v5, v5, v9
	vslo	v4, v4, v9
#else
	vsro	v5, v5, v9
	vsro	v4, v4, v9
#endif
	/* Convert and compare 16 bytes.  */
	TOLOWER()
	blt	cr6, L(retnull)	/* Equal up to the NUL: strings match.  */
	b	L(different)
	.align  4
L(retnull):
	li	rRTN, 0
	blr
	.align  4
L(bytebybyte):
	/* Scalar fallback, correct for any locale: each byte is looked up
	   in the locale's 32-bit tolower table.  */
	/* Unrolling loop for POWER: loads are done with 'lbz' plus
	offset and string descriptors are only updated in the end
	of loop unrolling. */
	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	rldicl	rTMP, r5, 62, 2	/* rTMP = n / 4 (4x-unrolled passes).  */
	cmpdi	cr7, rTMP, 0
	beq	cr7, L(lessthan4)
	mtctr	rTMP
#endif
L(loop):
	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 != '\0') || (r == 1) */
	beq	cr1, L(done)
	lbz	rCHAR1, 1(rSTR1)
	lbz	rCHAR2, 1(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 2(rSTR1)
	lbz	rCHAR2, 2(rSTR2)
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	lbz	rCHAR1, 3(rSTR1)
	lbz	rCHAR2, 3(rSTR2)
	cmpdi	rCHAR1, 0
	/* Increment both string descriptors */
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq     cr1, L(done)
	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
#ifdef USE_AS_STRNCASECMP
	bdnz	L(loop)
#else
	b	L(loop)
#endif
#ifdef USE_AS_STRNCASECMP
L(lessthan4):
	/* Handle the remaining n % 4 bytes one at a time.  */
	clrldi	r5, r5, 62
	cmpdi	cr7, r5, 0
	beq	cr7, L(retnull)
	mtctr	r5
L(loop1):
	cmpdi	rCHAR1, 0
	sldi	rADDR1, rCHAR1, 2
	sldi	rADDR2, rCHAR2, 2
	lwzx	rLWR1, rLOC, rADDR1
	lwzx	rLWR2, rLOC, rADDR2
	cmpw	cr1, rLWR1, rLWR2
	crorc	4*cr1+eq,eq,4*cr1+eq
	beq	cr1, L(done)
	addi	rSTR1, rSTR1, 1
	addi	rSTR2, rSTR2, 1
	lbz	rCHAR1, 0(rSTR1)
	lbz	rCHAR2, 0(rSTR2)
	bdnz	L(loop1)
#endif
L(done):
	/* Either a NUL or a difference was found; in both cases the
	   result is tolower(*s1) - tolower(*s2), sign-extended.  */
	subf	r0, rLWR2, rLWR1
	extsw	rRTN, r0
	blr
END (__STRCASECMP)
441
/* Export the public name as a weak alias of the internal symbol and
   provide the libc-internal hidden definition.  */
weak_alias (__STRCASECMP, STRCASECMP)
libc_hidden_builtin_def (__STRCASECMP)
444