1/* Optimized strcasestr implementation for PowerPC64/POWER8.
2   Copyright (C) 2016-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include <locale-defines.h>
21
22/* Char * [r3] strcasestr (char *s [r3], char * pat[r4])  */
23
24/* The performance gain is obtained by comparing 16 bytes.  */
25
26/* When the first char of r4 is hit ITERATIONS times in r3
27   fallback to default.  */
28#define ITERATIONS	64
29
30#ifndef STRCASESTR
31# define STRCASESTR __strcasestr
32#endif
33
34#ifndef STRLEN
35/* For builds without IFUNC support, local calls should be made to internal
36   GLIBC symbol (created by libc_hidden_builtin_def).  */
37# ifdef SHARED
38#  define STRLEN   __GI_strlen
39# else
40#  define STRLEN   strlen
41# endif
42#endif
43
44#ifndef STRNLEN
45/* For builds without IFUNC support, local calls should be made to internal
46   GLIBC symbol (created by libc_hidden_builtin_def).  */
47# ifdef SHARED
48#  define STRNLEN   __GI_strnlen
49# else
50#  define STRNLEN    __strnlen
51# endif
52#endif
53
54#ifndef STRCHR
55# ifdef SHARED
56#  define STRCHR   __GI_strchr
57# else
58#  define STRCHR   strchr
59# endif
60#endif
61
62/* Convert 16 bytes of v4 and reg to lowercase and compare.  */
63#define TOLOWER(reg)     \
64	vcmpgtub	v6, v4, v1; \
65	vcmpgtub	v7, v2, v4; \
66	vand	v8, v7, v6; \
67	vand	v8, v8, v3; \
68	vor	v4, v8, v4; \
69	vcmpgtub	v6, reg, v1; \
70	vcmpgtub	v7, v2, reg; \
71	vand	v8, v7, v6; \
72	vand	v8, v8, v3; \
73	vor	reg, v8, reg; \
74	vcmpequb.	v6, reg, v4;
75
76#define	FRAMESIZE	(FRAME_MIN_SIZE+48)
77	.machine  power8
78ENTRY (STRCASESTR, 4)
79	CALL_MCOUNT 2
80	mflr	r0			/* Load link register LR to r0.  */
81	std	r31, -8(r1)		/* Save callers register r31.  */
82	std	r30, -16(r1)		/* Save callers register r30.  */
83	std	r29, -24(r1)		/* Save callers register r29.  */
84	std	r28, -32(r1)		/* Save callers register r28.  */
85	std	r27, -40(r1)		/* Save callers register r27.  */
86	std	r0, 16(r1)		/* Store the link register.  */
87	cfi_offset(r31, -8)
88	cfi_offset(r30, -16)
89	cfi_offset(r29, -24)
90	cfi_offset(r28, -32)
91	cfi_offset(r27, -40)
92	cfi_offset(lr, 16)
93	stdu	r1, -FRAMESIZE(r1)	/* Create the stack frame.  */
94	cfi_adjust_cfa_offset(FRAMESIZE)
95
96	dcbt	0, r3
97	dcbt	0, r4
98	cmpdi	cr7, r3, 0		/* Input validation.  */
99	beq	cr7, L(retnull)
100	cmpdi	cr7, r4, 0
101	beq	cr7, L(retnull)
102
103	mr	r29, r3
104	mr	r30, r4
105	/* Load first byte from r4 and check if its null.  */
106	lbz	r6, 0(r4)
107	cmpdi	cr7, r6, 0
108	beq	cr7, L(ret_r3)
109
110	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
111	add	r9, r10, __libc_tsd_LOCALE@tls
112	ld	r9, 0(r9)
113	ld	r9, LOCALE_CTYPE_TOUPPER(r9)
114	sldi	r10, r6, 2		/* Convert to upper case.  */
115	lwzx	r28, r9, r10
116
117	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
118	add	r11, r10, __libc_tsd_LOCALE@tls
119	ld	r11, 0(r11)
120	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
121	sldi	r10, r6, 2              /* Convert to lower case.  */
122	lwzx	r27, r11, r10
123
124	/* Check if the first char is present.  */
125	mr	r4, r27
126	bl	STRCHR
127	nop
128	mr	r5, r3
129	mr	r3, r29
130	mr	r29, r5
131	mr	r4, r28
132	bl	STRCHR
133	nop
134	cmpdi	cr7, r29, 0
135	beq	cr7, L(firstpos)
136	cmpdi	cr7, r3, 0
137	beq	cr7, L(skipcheck)
138	cmpw	cr7, r3, r29
139	ble 	cr7, L(firstpos)
140	/* Move r3 to the first occurence.  */
141L(skipcheck):
142	mr	r3, r29
143L(firstpos):
144	mr	r29, r3
145
146	sldi	r9, r27, 8
147	or	r28, r9, r28
148	/* Reg r27 is used to count the number of iterations.  */
149	li	r27, 0
150	/* If first char of search str is not present.  */
151	cmpdi	cr7, r3, 0
152	ble	cr7, L(end)
153
154	/* Find the length of pattern.  */
155	mr	r3, r30
156	bl	STRLEN
157	nop
158
159	cmpdi	cr7, r3, 0	/* If search str is null.  */
160	beq	cr7, L(ret_r3)
161
162	mr	r31, r3
163	mr	r4, r3
164	mr	r3, r29
165	bl	STRNLEN
166	nop
167
168	cmpd	cr7, r3, r31 	/* If len(r3) < len(r4).  */
169	blt	cr7, L(retnull)
170
171	mr	r3, r29
172
173	/* Locales not matching ASCII for single bytes.  */
174	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
175	add	r9, r10, __libc_tsd_LOCALE@tls
176	ld	r9, 0(r9)
177	ld	r7, 0(r9)
178	addi	r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
179	lwz	r8, 0(r7)
180	cmpdi	cr7, r8, 1
181	beq	cr7, L(bytebybyte)
182
183	/* If len(r4) < 16 handle byte by byte.  */
184	/* For shorter strings we will not use vector registers.  */
185	cmpdi	cr7, r31, 16
186	blt	cr7, L(bytebybyte)
187
188	/* Comparison values used for TOLOWER.  */
189	/* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte.  */
190	vspltish	v0, 0
191	vspltisb	v5, 2
192	vspltisb	v4, 4
193	vsl	v3, v5, v4
194	vaddubm	v1, v3, v3
195	vspltisb	v5, 15
196	vaddubm	v2, v5, v5
197	vaddubm	v2, v1, v2
198	vspltisb	v4, -3
199	vaddubm	v2, v2, v4
200
201	/*
202	1. Load 16 bytes from r3 and r4
203	2. Check if there is null, If yes, proceed byte by byte path.
204	3. Else,Convert both to lowercase and compare.
205	4. If they are same proceed to 1.
206	5. If they dont match, find if first char of r4 is present in the
207	   loaded 16 byte of r3.
208	6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
209	*/
210
211	mr	r8, r3		/* Save r3 for future use.  */
212	mr	r4, r30		/* Restore r4.  */
213	clrldi	r10, r4, 60
214	lvx	v5, 0, r4	/* Load 16 bytes from r4.  */
215	cmpdi	cr7, r10, 0
216	beq	cr7, L(begin2)
217	/* If r4 is unaligned, load another 16 bytes.  */
218#ifdef __LITTLE_ENDIAN__
219	lvsr	v7, 0, r4
220#else
221	lvsl	v7, 0, r4
222#endif
223	addi	r5, r4, 16
224	lvx	v9, 0, r5
225#ifdef __LITTLE_ENDIAN__
226	vperm	v5, v9, v5, v7
227#else
228	vperm	v5, v5, v9, v7
229#endif
230L(begin2):
231	lvx	v4, 0, r3
232	vcmpequb.	v7, v0, v4	/* Check for null.  */
233	beq	cr6, L(nullchk6)
234	b	L(trailcheck)
235
236        .align  4
237L(nullchk6):
238	clrldi	r10, r3, 60
239	cmpdi	cr7, r10, 0
240	beq	cr7, L(next16)
241#ifdef __LITTLE_ENDIAN__
242	lvsr	v7, 0, r3
243#else
244	lvsl	v7, 0, r3
245#endif
246	addi	r5, r3, 16
247	/* If r3 is unaligned, load another 16 bytes.  */
248	lvx	v10, 0, r5
249#ifdef __LITTLE_ENDIAN__
250	vperm	v4, v10, v4, v7
251#else
252	vperm	v4, v4, v10, v7
253#endif
254L(next16):
255	vcmpequb.	v6, v0, v5	/* Check for null.  */
256	beq	cr6, L(nullchk)
257	b	L(trailcheck)
258
259	.align	4
260L(nullchk):
261	vcmpequb.	v6, v0, v4
262	beq	cr6, L(nullchk1)
263	b	L(retnull)
264
265	.align	4
266L(nullchk1):
267	/* Convert both v3 and v4 to lower.  */
268	TOLOWER(v5)
269	/* If both are same, branch to match.  */
270	blt	cr6, L(match)
271	/* Find if the first char is present in next 15 bytes.  */
272#ifdef __LITTLE_ENDIAN__
273	vspltb	v6, v5, 15
274	vsldoi	v7, v0, v4, 15
275#else
276	vspltb	v6, v5, 0
277	vspltisb	v7, 8
278	vslo	v7, v4, v7
279#endif
280	vcmpequb	v7, v6, v7
281	vcmpequb.	v6, v0, v7
282	/* Shift r3 by 16 bytes and proceed.  */
283	blt	cr6, L(shift16)
284	vclzd	v8, v7
285#ifdef __LITTLE_ENDIAN__
286	vspltb	v6, v8, 15
287#else
288	vspltb	v6, v8, 7
289#endif
290	vcmpequb.	v6, v6, v1
291	/* Shift r3 by 8  bytes and proceed.  */
292	blt	cr6, L(shift8)
293	b	L(begin)
294
295	.align	4
296L(match):
297	/* There is a match of 16 bytes, check next bytes.  */
298	cmpdi	cr7, r31, 16
299	mr	r29, r3
300	beq	cr7, L(ret_r3)
301
302L(secondmatch):
303	addi	r3, r3, 16
304	addi	r4, r4, 16
305	/* Load next 16 bytes of r3 and r4 and compare.  */
306	clrldi	r10, r4, 60
307	cmpdi	cr7, r10, 0
308	beq	cr7, L(nextload)
309	/* Handle unaligned case.  */
310	vor	v6, v9, v9
311	vcmpequb.	v7, v0, v6
312	beq	cr6, L(nullchk2)
313	b	L(trailcheck)
314
315	.align	4
316L(nullchk2):
317#ifdef __LITTLE_ENDIAN__
318	lvsr	v7, 0, r4
319#else
320	lvsl	v7, 0, r4
321#endif
322	addi	r5, r4, 16
323	/* If r4 is unaligned, load another 16 bytes.  */
324	lvx	v9, 0, r5
325#ifdef __LITTLE_ENDIAN__
326	vperm	v11, v9, v6, v7
327#else
328	vperm	v11, v6, v9, v7
329#endif
330	b	L(compare)
331
332	.align	4
333L(nextload):
334	lvx	v11, 0, r4
335L(compare):
336	vcmpequb.	v7, v0, v11
337	beq	cr6, L(nullchk3)
338	b	L(trailcheck)
339
340	.align	4
341L(nullchk3):
342	clrldi	r10, r3, 60
343	cmpdi 	cr7, r10, 0
344	beq 	cr7, L(nextload1)
345	/* Handle unaligned case.  */
346	vor	v4, v10, v10
347	vcmpequb.	v7, v0, v4
348	beq	cr6, L(nullchk4)
349	b	L(retnull)
350
351	.align	4
352L(nullchk4):
353#ifdef __LITTLE_ENDIAN__
354	lvsr	v7, 0, r3
355#else
356	lvsl	v7, 0, r3
357#endif
358	addi	r5, r3, 16
359	/* If r3 is unaligned, load another 16 bytes.  */
360	lvx	v10, 0, r5
361#ifdef __LITTLE_ENDIAN__
362	vperm	v4, v10, v4, v7
363#else
364	vperm	v4, v4, v10, v7
365#endif
366	b	L(compare1)
367
368	.align	4
369L(nextload1):
370	lvx	v4, 0, r3
371L(compare1):
372	vcmpequb.	v7, v0, v4
373	beq	cr6, L(nullchk5)
374	b	L(retnull)
375
376	.align	4
377L(nullchk5):
378	/* Convert both v3 and v4 to lower.  */
379	TOLOWER(v11)
380	/* If both are same, branch to secondmatch.  */
381	blt 	cr6, L(secondmatch)
382	/* Continue the search.  */
383        b	L(begin)
384
385	.align	4
386L(trailcheck):
387	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
388	add	r11, r10, __libc_tsd_LOCALE@tls
389	ld	r11, 0(r11)
390	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
391L(loop2):
392	lbz	r5, 0(r3)               /* Load byte from r3.  */
393	lbz	r6, 0(r4)               /* Load next byte from r4.  */
394	cmpdi 	cr7, r6, 0              /* Is it null?  */
395	beq 	cr7, L(updater3)
396	cmpdi 	cr7, r5, 0              /* Is it null?  */
397	beq 	cr7, L(retnull)         /* If yes, return.  */
398	addi	r3, r3, 1
399	addi	r4, r4, 1               /* Increment r4.  */
400	sldi	r10, r5, 2              /* Convert to lower case.  */
401	lwzx	r10, r11, r10
402	sldi	r7, r6, 2               /* Convert to lower case.  */
403	lwzx	r7, r11, r7
404	cmpw	cr7, r7, r10            /* Compare with byte from r4.  */
405	bne	cr7, L(begin)
406	b	L(loop2)
407
408	.align	4
409L(shift8):
410	addi	r8, r8, 7
411	b	L(begin)
412	.align	4
413L(shift16):
414	addi	r8, r8, 15
415	.align	4
416L(begin):
417	addi	r8, r8, 1
418	mr	r3, r8
419	/* When our iterations exceed ITERATIONS,fall back to default.  */
420	addi	r27, r27, 1
421	cmpdi	cr7, r27, ITERATIONS
422	beq	cr7, L(default)
423	mr	r4, r30         /* Restore r4.  */
424	b	L(begin2)
425
426	/* Handling byte by byte.  */
427	.align	4
428L(loop1):
429	mr	r3, r8
430	addi	r27, r27, 1
431	cmpdi	cr7, r27, ITERATIONS
432	beq	cr7, L(default)
433	mr	r29, r8
434	srdi	r4, r28, 8
435	/* Check if the first char is present.  */
436	bl	STRCHR
437	nop
438	mr	r5, r3
439	mr	r3, r29
440	mr	r29, r5
441	sldi	r4, r28, 56
442	srdi	r4, r4, 56
443	bl	STRCHR
444	nop
445	cmpdi	cr7, r29, 0
446	beq	cr7, L(nextpos)
447	cmpdi	cr7, r3, 0
448	beq	cr7, L(skipcheck1)
449	cmpw	cr7, r3, r29
450	ble 	cr7, L(nextpos)
451	/* Move r3 to first occurence.  */
452L(skipcheck1):
453	mr	r3, r29
454L(nextpos):
455	mr	r29, r3
456	cmpdi 	cr7, r3, 0
457	ble 	cr7, L(retnull)
458L(bytebybyte):
459	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
460	add	r11, r10, __libc_tsd_LOCALE@tls
461	ld	r11, 0(r11)
462	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
463	mr	r4, r30                 /* Restore r4.  */
464	mr	r8, r3                  /* Save r3.  */
465	addi	r8, r8, 1
466
467L(loop):
468	addi	r3, r3, 1
469	lbz	r5, 0(r3)               /* Load byte from r3.  */
470	addi	r4, r4, 1               /* Increment r4.  */
471	lbz	r6, 0(r4)               /* Load next byte from r4.  */
472	cmpdi 	cr7, r6, 0              /* Is it null?  */
473	beq 	cr7, L(updater3)
474	cmpdi 	cr7, r5, 0              /* Is it null?  */
475	beq 	cr7, L(retnull)         /* If yes, return.  */
476	sldi	r10, r5, 2              /* Convert to lower case.  */
477	lwzx	r10, r11, r10
478	sldi	r7, r6, 2               /* Convert to lower case.  */
479	lwzx	r7, r11, r7
480	cmpw	cr7, r7, r10            /* Compare with byte from r4.  */
481	bne 	cr7, L(loop1)
482	b	L(loop)
483
484	/* Handling return values.  */
485	.align	4
486L(updater3):
487	subf	r3, r31, r3	/* Reduce r31 (len of r4) from r3.  */
488	b	L(end)
489
490	.align	4
491L(ret_r3):
492	mr	r3, r29		/* Return point of match.  */
493	b	L(end)
494
495	.align	4
496L(retnull):
497	li	r3, 0		/* Substring was not found.  */
498	b	L(end)
499
500	.align	4
501L(default):
502	mr	r4, r30
503	bl	__strcasestr_ppc
504	nop
505
506	.align	4
507L(end):
508	addi	r1, r1, FRAMESIZE	/* Restore stack pointer.  */
509	cfi_adjust_cfa_offset(-FRAMESIZE)
510	ld	r0, 16(r1)	/* Restore the saved link register.  */
511	ld	r27, -40(r1)
512	ld	r28, -32(r1)
513	ld	r29, -24(r1)	/* Restore callers save register r29.  */
514	ld	r30, -16(r1)	/* Restore callers save register r30.  */
515	ld	r31, -8(r1)	/* Restore callers save register r31.  */
516	cfi_restore(lr)
517	cfi_restore(r27)
518	cfi_restore(r28)
519	cfi_restore(r29)
520	cfi_restore(r30)
521	cfi_restore(r31)
522	mtlr	r0		/* Branch to link register.  */
523	blr
524END (STRCASESTR)
525
526weak_alias (__strcasestr, strcasestr)
527libc_hidden_def (__strcasestr)
528libc_hidden_builtin_def (strcasestr)
529