1/* Optimized strchr implementation for PowerPC64/POWER8.
2   Copyright (C) 2016-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21#ifdef USE_AS_STRCHRNUL
22# ifndef STRCHRNUL
23#   define FUNC_NAME __strchrnul
24# else
25#   define FUNC_NAME STRCHRNUL
26# endif
27#else
28# ifndef STRCHR
29#  define FUNC_NAME strchr
30# else
31#  define FUNC_NAME STRCHR
32# endif
33#endif  /* !USE_AS_STRCHRNUL  */
34
35/* int [r3] strchr (char *s [r3], int c [r4])  */
36	.machine  power8
37ENTRY_TOCLESS (FUNC_NAME)
38	CALL_MCOUNT 2
39	dcbt	0,r3
40	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
41	cmpdi	cr7,r4,0
42	ld	r12,0(r8)     /* Load doubleword from memory.  */
43	li	r0,0	      /* Doubleword with null chars to use
44				 with cmpb.  */
45
46	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
47
48	beq	cr7,L(null_match)
49
50	/* Replicate byte to doubleword.  */
51	insrdi	r4,r4,8,48
52	insrdi	r4,r4,16,32
53	insrdi  r4,r4,32,0
54
55	/* Now r4 has a doubleword of c bytes and r0 has
56	   a doubleword of null bytes.  */
57
58	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
59	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
60
61	/* Move the doublewords left and right to discard the bits that are
62	   not part of the string and bring them back as zeros.  */
63#ifdef __LITTLE_ENDIAN__
64	srd	r10,r10,r6
65	srd	r11,r11,r6
66	sld	r10,r10,r6
67	sld	r11,r11,r6
68#else
69	sld	r10,r10,r6
70	sld	r11,r11,r6
71	srd	r10,r10,r6
72	srd	r11,r11,r6
73#endif
74	or	r5,r10,r11    /* OR the results to speed things up.  */
75	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
76				 have been found.  */
77	bne	cr7,L(done)
78
79	mtcrf   0x01,r8
80
81	/* Are we now aligned to a doubleword boundary?  If so, skip to
82	   the main loop.  Otherwise, go through the alignment code.  */
83
84	bt	28,L(loop)
85
86	/* Handle WORD2 of pair.  */
87	ldu	r12,8(r8)
88	cmpb    r10,r12,r4
89	cmpb	r11,r12,r0
90	or	r5,r10,r11
91	cmpdi	cr7,r5,0
92	bne	cr7,L(done)
93	b	L(loop)	      /* We branch here (rather than falling through)
94				 to skip the nops due to heavy alignment
95				 of the loop below.  */
96
97	.p2align  5
98L(loop):
99	/* Load two doublewords, compare and merge in a
100	   single register for speed.  This is an attempt
101	   to speed up the null-checking process for bigger strings.  */
102	ld	r12,8(r8)
103	ldu	r9,16(r8)
104	cmpb	r10,r12,r4
105	cmpb	r11,r12,r0
106	cmpb	r6,r9,r4
107	cmpb	r7,r9,r0
108	or	r5,r10,r11
109	or	r9,r6,r7
110	or	r12,r5,r9
111	cmpdi	cr7,r12,0
112	beq	cr7,L(vector)
113	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
114	   the first doubleword and decrement the address in case the first
115	   doubleword really contains a c/null byte.  */
116
117	cmpdi	cr6,r5,0
118	addi	r8,r8,-8
119	bne	cr6,L(done)
120
121	/* The c/null byte must be in the second doubleword.  Adjust the
122	   address again and move the result of cmpb to r10 so we can calculate
123	   the pointer.  */
124
125	mr	r10,r6
126	mr	r11,r7
127	addi	r8,r8,8
128#ifdef USE_AS_STRCHRNUL
129	mr	r5, r9
130#endif
131	/* r10/r11 have the output of the cmpb instructions, that is,
132	   0xff in the same position as the c/null byte in the original
133	   doubleword from the string.  Use that to calculate the pointer.  */
134L(done):
135#ifdef USE_AS_STRCHRNUL
136	mr	r10, r5
137#endif
138#ifdef __LITTLE_ENDIAN__
139	addi    r3,r10,-1
140	andc    r3,r3,r10
141	popcntd	r0,r3
142# ifndef USE_AS_STRCHRNUL
143	addi    r4,r11,-1
144	andc    r4,r4,r11
145	cmpld	cr7,r3,r4
146	bgt	cr7,L(no_match)
147# endif
148#else
149	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
150# ifndef USE_AS_STRCHRNUL
151	cmpld	cr7,r11,r10
152	bgt	cr7,L(no_match)
153# endif
154#endif
155	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
156	add	r3,r8,r0      /* Return address of the matching c byte
157				 or null in case c was not found.  */
158	blr
159
160	/* Check the first 32B in GPR's and move to vectorized loop.  */
161	.p2align  5
162L(vector):
163	addi	r3, r8, 8
164	andi.	r10, r3, 31
165	bne	cr0, L(loop)
166	vspltisb	v0, 0
167	/* Precompute vbpermq constant.  */
168	vspltisb	v10, 3
169	lvsl	v11, r0, r0
170	vslb	v10, v11, v10
171	mtvrd	v1, r4
172	li	r5, 16
173	vspltb	v1, v1, 7
174	/* Compare 32 bytes in each loop.  */
175L(continue):
176	lvx	v4, 0, r3
177	lvx	v5, r3, r5
178	vcmpequb	v2, v0, v4
179	vcmpequb	v3, v0, v5
180	vcmpequb	v6, v1, v4
181	vcmpequb	v7, v1, v5
182	vor	v8, v2, v3
183	vor	v9, v6, v7
184	vor	v11, v8, v9
185	vcmpequb.	v11, v0, v11
186	addi	r3, r3, 32
187	blt	cr6, L(continue)
188	/* One (or both) of the quadwords contains a c/null byte.  */
189	addi	r3, r3, -32
190#ifndef USE_AS_STRCHRNUL
191	vcmpequb.	v11, v0, v9
192	blt	cr6, L(no_match)
193#endif
194	/* Permute the first bit of each byte into bits 48-63.  */
195	vbpermq	v2, v2, v10
196	vbpermq	v3, v3, v10
197	vbpermq	v6, v6, v10
198	vbpermq	v7, v7, v10
199	/* Shift each component into its correct position for merging.  */
200#ifdef __LITTLE_ENDIAN__
201	vsldoi	v3, v3, v3, 2
202	vsldoi	v7, v7, v7, 2
203#else
204	vsldoi	v2, v2, v2, 6
205	vsldoi	v3, v3, v3, 4
206	vsldoi	v6, v6, v6, 6
207	vsldoi	v7, v7, v7, 4
208#endif
209
210        /* Merge the results and move to a GPR.  */
211        vor     v1, v3, v2
212        vor     v2, v6, v7
213        vor     v4, v1, v2
214	mfvrd	r5, v4
215#ifdef __LITTLE_ENDIAN__
216	addi	r6, r5, -1
217	andc	r6, r6, r5
218	popcntd	r6, r6
219#else
220	cntlzd	r6, r5	/* Count leading zeros before the match.  */
221#endif
222	add	r3, r3, r6	/* Compute final length.  */
223	/* Return NULL if null found before c.  */
224#ifndef USE_AS_STRCHRNUL
225	lbz	r4, 0(r3)
226	cmpdi	cr7, r4, 0
227	beq	cr7, L(no_match)
228#endif
229	blr
230
231#ifndef USE_AS_STRCHRNUL
232	.align	4
233L(no_match):
234	li	r3,0
235	blr
236#endif
237
238/* We are here because strchr was called with a null byte.  */
239	.align	4
240L(null_match):
241	/* r0 has a doubleword of null bytes.  */
242
243	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
244
245	/* Move the doublewords left and right to discard the bits that are
246	   not part of the string and bring them back as zeros.  */
247#ifdef __LITTLE_ENDIAN__
248	srd	r5,r5,r6
249	sld	r5,r5,r6
250#else
251	sld	r5,r5,r6
252	srd	r5,r5,r6
253#endif
254	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
255				 have been found.  */
256	bne	cr7,L(done_null)
257
258	mtcrf   0x01,r8
259
260	/* Are we now aligned to a quadword boundary?  If so, skip to
261	   the main loop.  Otherwise, go through the alignment code.  */
262
263	bt	28,L(loop_null)
264
265	/* Handle WORD2 of pair.  */
266	ldu	r12,8(r8)
267	cmpb    r5,r12,r0
268	cmpdi	cr7,r5,0
269	bne	cr7,L(done_null)
270	b	L(loop_null)  /* We branch here (rather than falling through)
271				 to skip the nops due to heavy alignment
272				 of the loop below.  */
273
274	/* Main loop to look for the end of the string.  Since it's a
275	   small loop (< 8 instructions), align it to 32-bytes.  */
276	.p2align  5
277L(loop_null):
278	/* Load two doublewords, compare and merge in a
279	   single register for speed.  This is an attempt
280	   to speed up the null-checking process for bigger strings.  */
281	ld	r12,8(r8)
282	ldu     r11,16(r8)
283	cmpb	r5,r12,r0
284	cmpb	r10,r11,r0
285	or	r6,r5,r10
286	cmpdi	cr7,r6,0
287	beq	cr7,L(vector1)
288
289	/* OK, one (or both) of the doublewords contains a null byte.  Check
290	   the first doubleword and decrement the address in case the first
291	   doubleword really contains a null byte.  */
292
293	cmpdi	cr6,r5,0
294	addi	r8,r8,-8
295	bne	cr6,L(done_null)
296
297	/* The null byte must be in the second doubleword.  Adjust the address
298	   again and move the result of cmpb to r10 so we can calculate the
299	   pointer.  */
300
301	mr	r5,r10
302	addi	r8,r8,8
303
304	/* r5 has the output of the cmpb instruction, that is, it contains
305	   0xff in the same position as the null byte in the original
306	   doubleword from the string.  Use that to calculate the pointer.  */
307L(done_null):
308#ifdef __LITTLE_ENDIAN__
309	addi    r0,r5,-1
310	andc    r0,r0,r5
311	popcntd	r0,r0
312#else
313	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
314#endif
315	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
316	add	r3,r8,r0      /* Return address of the matching null byte.  */
317	blr
318	.p2align  5
319L(vector1):
320	addi    r3, r8, 8
321	andi.	r10, r3, 31
322	bne	cr0, L(loop_null)
323	vspltisb	v8, -1
324	vspltisb	v0, 0
325	vspltisb	v10, 3
326	lvsl	v11, r0, r0
327	vslb	v10, v11, v10
328	li	r5, 16
329L(continue1):
330	lvx	v4, 0, r3
331	lvx	v5, r3, r5
332	vcmpequb	v2, v0, v4
333	vcmpequb	v3, v0, v5
334	vor	v8, v2, v3
335	vcmpequb.	v11, v0, v8
336	addi	r3, r3, 32
337	blt	cr6, L(continue1)
338	addi	r3, r3, -32
339L(end1):
340	vbpermq	v2, v2, v10
341	vbpermq	v3, v3, v10
342	/* Shift each component into its correct position for merging.  */
343#ifdef __LITTLE_ENDIAN__
344	vsldoi	v3, v3, v3, 2
345#else
346	vsldoi	v2, v2, v2, 6
347	vsldoi	v3, v3, v3, 4
348#endif
349
350        /* Merge the results and move to a GPR.  */
351        vor     v4, v3, v2
352	mfvrd	r5, v4
353#ifdef __LITTLE_ENDIAN__
354	addi	r6, r5, -1
355	andc	r6, r6, r5
356	popcntd	r6, r6
357#else
358	cntlzd	r6, r5	/* Count leading zeros before the match.  */
359#endif
360	add	r3, r3, r6	/* Compute final length.  */
361	blr
362END (FUNC_NAME)
363
364#ifndef USE_AS_STRCHRNUL
365weak_alias (strchr, index)
366libc_hidden_builtin_def (strchr)
367#endif
368