1/* Optimized strstr implementation for PowerPC64/POWER7.
2   Copyright (C) 2015-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21/* Char * [r3] strstr (char *s [r3], char * pat[r4])  */
22
23/* The performance gain is obtained using aligned memory access, load
24 * doubleword and usage of cmpb instruction for quicker comparison.  */
25
26#define ITERATIONS	64
27
28#ifndef STRSTR
29# define STRSTR strstr
30#endif
31
32#ifndef STRLEN
33/* For builds with no IFUNC support, local calls should be made to internal
34   GLIBC symbol (created by libc_hidden_builtin_def).  */
35# ifdef SHARED
36#  define STRLEN   __GI_strlen
37#  define STRLEN_is_local
38# else
39#  define STRLEN   strlen
40# endif
41#endif
42
43#ifndef STRNLEN
44/* For builds with no IFUNC support, local calls should be made to internal
45   GLIBC symbol (created by libc_hidden_builtin_def).  */
46# ifdef SHARED
47#  define STRNLEN   __GI_strnlen
48#  define STRNLEN_is_local
49# else
50#  define STRNLEN  __strnlen
51# endif
52#endif
53
54#ifndef STRCHR
55# ifdef SHARED
56#  define STRCHR   __GI_strchr
57#  define STRCHR_is_local
58# else
59#  define STRCHR   strchr
60# endif
61#endif
62
63#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
64	.machine  power7
65/* Can't be ENTRY_TOCLESS due to calling __strstr_ppc which uses r2.  */
66ENTRY (STRSTR, 4)
67	CALL_MCOUNT 2
68	mflr	r0			/* Load link register LR to r0.  */
69	std	r31, -8(r1)		/* Save callers register r31.  */
70	std	r30, -16(r1)		/* Save callers register r30.  */
71	std	r29, -24(r1)		/* Save callers register r29.  */
72	std	r28, -32(r1)		/* Save callers register r28.  */
73	std	r0, 16(r1)		/* Store the link register.  */
74	cfi_offset(r31, -8)
75	cfi_offset(r30, -16)
76	cfi_offset(r28, -32)
77	cfi_offset(r29, -24)
78	cfi_offset(lr, 16)
79	stdu	r1, -FRAMESIZE(r1)	/* Create the stack frame.  */
80	cfi_adjust_cfa_offset(FRAMESIZE)
81
82	dcbt	0, r3
83	dcbt	0, r4
84	cmpdi	cr7, r3, 0
85	beq	cr7, L(retnull)
86	cmpdi	cr7, r4, 0
87	beq	cr7, L(retnull)
88
89	mr	r29, r3
90	mr	r30, r4
91	mr	r3, r4
92	bl	STRLEN
93#ifndef STRLEN_is_local
94	nop
95#endif
96
97	cmpdi	cr7, r3, 0	/* If search str is null.  */
98	beq	cr7, L(ret_r3)
99
100	mr	r31, r3
101	mr	r4, r3
102	mr	r3, r29
103	bl	STRNLEN
104#ifndef STRNLEN_is_local
105	nop
106#endif
107
108	cmpd	cr7, r3, r31 	/* If len(r3) < len(r4).  */
109	blt	cr7, L(retnull)
110	mr	r3, r29
111	lbz	r4, 0(r30)
112	bl	STRCHR
113#ifndef STRCHR_is_local
114	nop
115#endif
116
117	mr	r11, r3
118	/* If first char of search str is not present.  */
119	cmpdi	cr7, r3, 0
120	ble	cr7, L(end)
121	/* Reg r28 is used to count the number of iterations. */
122	li	r28, 0
123	rldicl	r8, r3, 0, 52	/* Page cross check.  */
124	cmpldi	cr7, r8, 4096-16
125	bgt	cr7, L(bytebybyte)
126
127	rldicl	r8, r30, 0, 52
128	cmpldi	cr7, r8, 4096-16
129	bgt	cr7, L(bytebybyte)
130
131	/* If len(r4) < 8 handle in a different way.  */
132	/* Shift position based on null and use cmpb.  */
133	cmpdi	cr7, r31, 8
134	blt	cr7, L(lessthan8)
135
136	/* Len(r4) >= 8 reaches here.  */
137	mr	r8, r3		/* Save r3 for future use.  */
138	mr	r4, r30		/* Restore r4.  */
139	li	r0, 0
140	rlwinm	r10, r30, 3, 26, 28	/* Calculate padding in bits.  */
141	clrrdi	r4, r4, 3	/* Make r4 aligned to 8.  */
142	ld	r6, 0(r4)
143	addi	r4, r4, 8
144	cmpdi	cr7, r10, 0	/* Check if its already aligned?  */
145	beq	cr7, L(begin1)
146#ifdef __LITTLE_ENDIAN__
147	srd	r6, r6, r10	/* Discard unwanted bits.  */
148#else
149	sld	r6, r6, r10
150#endif
151	ld	r9, 0(r4)
152	subfic	r10, r10, 64
153#ifdef __LITTLE_ENDIAN__
154	sld	r9, r9, r10	/* Discard unwanted bits.  */
155#else
156	srd	r9, r9, r10
157#endif
158	or	r6, r6, r9	/* Form complete search str.  */
159L(begin1):
160	mr	r29, r6
161	rlwinm	r10, r3, 3, 26, 28
162	clrrdi	r3, r3, 3
163	ld	r5, 0(r3)
164	cmpb	r9, r0, r6	/* Check if input has null.  */
165	cmpdi	cr7, r9, 0
166	bne	cr7, L(return3)
167	cmpb	r9, r0, r5	/* Check if input has null.  */
168#ifdef __LITTLE_ENDIAN__
169	srd	r9, r9, r10
170#else
171	sld	r9, r9, r10
172#endif
173	cmpdi	cr7, r9, 0
174	bne	cr7, L(retnull)
175
176	li	r12, -8		/* Shift values.  */
177	li	r11, 72		/* Shift values.  */
178	cmpdi	cr7, r10, 0
179	beq	cr7, L(nextbyte1)
180	mr	r12, r10
181	addi	r12, r12, -8
182	subfic	r11, r12, 64
183
184L(nextbyte1):
185	ldu	r7, 8(r3) 	/* Load next dw.  */
186	addi	r12, r12, 8	/* Shift one byte and compare.  */
187	addi	r11, r11, -8
188#ifdef __LITTLE_ENDIAN__
189	srd	r9, r5, r12	/* Rotate based on mask.  */
190	sld	r10, r7, r11
191#else
192	sld	r9, r5, r12
193	srd	r10, r7, r11
194#endif
195	/* Form single dw from few bytes on first load and second load.  */
196	or	r10, r9, r10
197	/* Check for null in the formed dw.  */
198	cmpb	r9, r0, r10
199	cmpdi	cr7, r9, 0
200	bne	cr7, L(retnull)
201	/* Cmpb search str and input str.  */
202	cmpb	r9, r10, r6
203	cmpdi	cr7, r9, -1
204	beq	cr7, L(match)
205	addi	r8, r8, 1
206	b	L(begin)
207
208	.align	4
209L(match):
210	/* There is a match of 8 bytes, check next bytes.  */
211	cmpdi	cr7, r31, 8
212	beq	cr7, L(return)
213	/* Update next starting point r8.  */
214	srdi	r9, r11, 3
215	subf	r9, r9, r3
216	mr	r8, r9
217
218L(secondmatch):
219	mr	r5, r7
220	rlwinm	r10, r30, 3, 26, 28	/* Calculate padding in bits.  */
221	ld	r6, 0(r4)
222	addi	r4, r4, 8
223	cmpdi	cr7, r10, 0	/* Check if its already aligned?  */
224	beq	cr7, L(proceed3)
225#ifdef __LITTLE_ENDIAN__
226	srd	r6, r6, r10	/* Discard unwanted bits.  */
227	cmpb	r9, r0, r6
228	sld	r9, r9, r10
229#else
230	sld	r6, r6, r10
231	cmpb	r9, r0, r6
232	srd	r9, r9, r10
233#endif
234	cmpdi	cr7, r9, 0
235	bne	cr7, L(proceed3)
236	ld	r9, 0(r4)
237	subfic	r10, r10, 64
238#ifdef __LITTLE_ENDIAN__
239	sld	r9, r9, r10	/* Discard unwanted bits.  */
240#else
241	srd	r9, r9, r10
242#endif
243	or	r6, r6, r9	/* Form complete search str.  */
244
245L(proceed3):
246	li	r7, 0
247	addi	r3, r3, 8
248	cmpb	r9, r0, r5
249	cmpdi	cr7, r9, 0
250	bne	cr7, L(proceed4)
251	ld	r7, 0(r3)
252L(proceed4):
253#ifdef __LITTLE_ENDIAN__
254	srd	r9, r5, r12
255	sld	r10, r7, r11
256#else
257	sld	r9, r5, r12
258	srd	r10, r7, r11
259#endif
260	/* Form single dw with few bytes from first and second load.  */
261	or	r10, r9, r10
262	cmpb	r9, r0, r6
263	cmpdi	cr7, r9, 0
264	bne	cr7, L(return4)
265	/* Check for null in the formed dw.  */
266	cmpb	r9, r0, r10
267	cmpdi	cr7, r9, 0
268	bne	cr7, L(retnull)
269	/* If the next 8 bytes dont match, start search again.  */
270	cmpb	r9, r10, r6
271	cmpdi	cr7, r9, -1
272	bne	cr7, L(reset)
273	/* If the next 8 bytes match, load and compare next 8.  */
274	b	L(secondmatch)
275
276	.align	4
277L(reset):
278	/* Start the search again.  */
279	addi	r8, r8, 1
280	b	L(begin)
281
282	.align	4
283L(return3):
284	/* Count leading zeros and compare partial dw.  */
285#ifdef __LITTLE_ENDIAN__
286	addi	r7, r9, -1
287	andc	r7, r7, r9
288	popcntd	r7, r7
289	subfic	r7, r7, 64
290	sld	r10, r5, r7
291	sld	r6, r6, r7
292#else
293	cntlzd	r7, r9
294	subfic	r7, r7, 64
295	srd	r10, r5, r7
296	srd	r6, r6, r7
297#endif
298	cmpb	r9, r10, r6
299	cmpdi	cr7, r9, -1
300	addi	r8, r8, 1
301	/* Start search again if there is no match.  */
302	bne	cr7, L(begin)
303	/* If the words match, update return values.  */
304	subfic	r7, r7, 64
305	srdi	r7, r7, 3
306	add	r3, r3, r7
307	subf	r3, r31, r3
308	b	L(end)
309
310	.align	4
311L(return4):
312	/* Count leading zeros and compare partial dw.  */
313#ifdef __LITTLE_ENDIAN__
314	addi	r7, r9, -1
315	andc	r7, r7, r9
316	popcntd	r7, r7
317	subfic	r7, r7, 64
318	sld	r10, r10, r7
319	sld	r6, r6, r7
320#else
321	cntlzd	r7, r9
322	subfic	r7, r7, 64
323	srd	r10, r10, r7
324	srd	r6, r6, r7
325#endif
326	cmpb	r9, r10, r6
327	cmpdi	cr7, r9, -1
328	addi	r8, r8, 1
329	bne	cr7, L(begin)
330	subfic	r7, r7, 64
331	srdi	r11, r11, 3
332	subf	r3, r11, r3
333	srdi	r7, r7, 3
334	add	r3, r3, r7
335	subf	r3, r31, r3
336	b	L(end)
337
338	.align	4
339L(begin):
340	mr	r3, r8
341	/* When our iterations exceed ITERATIONS,fall back to default. */
342	addi	r28, r28, 1
343	cmpdi	cr7, r28, ITERATIONS
344	beq	cr7, L(default)
345	lbz	r4, 0(r30)
346	bl	STRCHR
347#ifndef STRCHR_is_local
348	nop
349#endif
350	/* If first char of search str is not present.  */
351	cmpdi	cr7, r3, 0
352	ble	cr7, L(end)
353	mr	r8, r3
354	mr	r4, r30		/* Restore r4.  */
355	li	r0, 0
356	mr	r6, r29
357	clrrdi	r4, r4, 3
358	addi	r4, r4, 8
359	b	L(begin1)
360
361	/* Handle less than 8 search string.  */
362	.align	4
363L(lessthan8):
364	mr	r4, r3
365	mr	r9, r30
366	li	r0, 0
367
368	rlwinm	r10, r9, 3, 26, 28	/* Calculate padding in bits.  */
369	srdi	r8, r10, 3	/* Padding in bytes.  */
370	clrrdi	r9, r9, 3	/* Make r4 aligned to 8.  */
371	ld	r6, 0(r9)
372	cmpdi	cr7, r10, 0	/* Check if its already aligned?  */
373	beq	cr7, L(proceed2)
374#ifdef __LITTLE_ENDIAN__
375	srd	r6, r6, r10	/* Discard unwanted bits.  */
376#else
377	sld	r6, r6, r10
378#endif
379	subfic	r8, r8, 8
380	cmpd	cr7, r8, r31	/* Next load needed?  */
381	bge	cr7, L(proceed2)
382	ld	r7, 8(r9)
383	subfic	r10, r10, 64
384#ifdef __LITTLE_ENDIAN__
385	sld	r7, r7, r10	/* Discard unwanted bits.  */
386#else
387	srd	r7, r7, r10
388#endif
389	or	r6, r6, r7	/* Form complete search str.  */
390L(proceed2):
391	mr	r29, r6
392	rlwinm	r10, r3, 3, 26, 28
393	clrrdi	r7, r3, 3	/* Make r3 aligned.  */
394	ld	r5, 0(r7)
395	sldi	r8, r31, 3
396	subfic	r8, r8, 64
397#ifdef __LITTLE_ENDIAN__
398	sld	r6, r6, r8
399	cmpb	r9, r0, r5
400	srd	r9, r9, r10
401#else
402	srd	r6, r6, r8
403	cmpb	r9, r0, r5
404	sld	r9, r9, r10
405#endif
406	cmpdi	cr7, r9, 0
407	bne	cr7, L(noload)
408	cmpdi	cr7, r10, 0
409	beq	cr7, L(continue)
410	ld	r7, 8(r7)
411L(continue1):
412	mr	r12, r10
413	addi	r12, r12, -8
414	subfic	r11, r12, 64
415	b	L(nextbyte)
416
417	.align	4
418L(continue):
419	ld	r7, 8(r7)
420	li	r12, -8		/* Shift values.  */
421	li	r11, 72		/* Shift values.  */
422L(nextbyte):
423	addi	r12, r12, 8	/* Mask for rotation.  */
424	addi	r11, r11, -8
425#ifdef __LITTLE_ENDIAN__
426	srd	r9, r5, r12
427	sld	r10, r7, r11
428	or	r10, r9, r10
429	sld	r10, r10, r8
430	cmpb	r9, r0, r10
431	srd	r9, r9, r8
432#else
433	sld	r9, r5, r12
434	srd	r10, r7, r11
435	or	r10, r9, r10
436	srd	r10, r10, r8
437	cmpb	r9, r0, r10
438	sld	r9, r9, r8
439#endif
440	cmpdi	cr7, r9, 0
441	bne	cr7, L(retnull)
442	cmpb	r9, r10, r6
443	cmpdi	cr7, r9, -1
444	beq	cr7, L(end)
445	addi	r3, r4, 1
446	/* When our iterations exceed ITERATIONS,fall back to default. */
447	addi	r28, r28, 1
448	cmpdi	cr7, r28, ITERATIONS
449	beq	cr7, L(default)
450	lbz	r4, 0(r30)
451	bl	STRCHR
452#ifndef STRCHR_is_local
453	nop
454#endif
455	/* If first char of search str is not present.  */
456	cmpdi	cr7, r3, 0
457	ble	cr7, L(end)
458	mr	r4, r3
459	mr	r6, r29
460	li	r0, 0
461	b	L(proceed2)
462
463	.align	4
464L(noload):
465	/* Reached null in r3, so skip next load.  */
466	li 	r7, 0
467	b	L(continue1)
468
469	.align	4
470L(return):
471	/* Update return values.  */
472	srdi	r9, r11, 3
473	subf	r3, r9, r3
474	b	L(end)
475
476	/* Handling byte by byte.  */
477	.align	4
478L(bytebybyte):
479	mr	r8, r3
480	addi	r8, r8, -1
481L(loop1):
482	addi	r8, r8, 1
483	mr	r3, r8
484	mr	r4, r30
485	lbz	r6, 0(r4)
486	cmpdi	cr7, r6, 0
487	beq	cr7, L(updater3)
488L(loop):
489	lbz	r5, 0(r3)
490	cmpdi	cr7, r5, 0
491	beq	cr7, L(retnull)
492	cmpld	cr7, r6, r5
493	bne	cr7, L(loop1)
494	addi	r3, r3, 1
495	addi	r4, r4, 1
496	lbz	r6, 0(r4)
497	cmpdi	cr7, r6, 0
498	beq	cr7, L(updater3)
499	b	L(loop)
500
501	/* Handling return values.  */
502	.align	4
503L(updater3):
504	subf	r3, r31, r3	/* Reduce len of r4 from r3.  */
505	b	L(end)
506
507	.align	4
508L(ret_r3):
509	mr	r3, r29		/* Return r3.  */
510	b	L(end)
511
512	.align	4
513L(retnull):
514	li	r3, 0		/* Return NULL.  */
515	b	L(end)
516
517	.align	4
518L(default):
519	mr	r4, r30
520	bl	__strstr_ppc
521	nop
522
523	.align	4
524L(end):
525	addi	r1, r1, FRAMESIZE	/* Restore stack pointer.  */
526	cfi_adjust_cfa_offset(-FRAMESIZE)
527	ld	r0, 16(r1)	/* Restore the saved link register.  */
528	ld	r28, -32(r1)	/* Restore callers save register r28.  */
529	ld	r29, -24(r1)	/* Restore callers save register r29.  */
530	ld	r30, -16(r1)	/* Restore callers save register r30.  */
531	ld	r31, -8(r1)	/* Restore callers save register r31.  */
532	mtlr	r0		/* Branch to link register.  */
533	blr
534END (STRSTR)
535libc_hidden_builtin_def (strstr)
536