/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt	0, r3
	clrrdi  r8, r3, 3
	insrdi	r4, r4, 8, 48

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r5
	   r7 |= -(r7 < r3)  */
	add     r7, r3, r5
	subfc   r6, r3, r7
	subfe   r9, r9, r9
	extsw   r6, r9
	or      r7, r7, r6
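
	/* How the saturation works (illustrative C, not part of the code):
	   subfc sets CA iff r7 >= r3, and subfe r9,r9,r9 yields CA - 1,
	   i.e. 0 when the addition did not wrap and -1 when it did:

	     end  = s + n;
	     mask = -(uint64_t)(end < s);   // 0, or ~0UL on wrap-around
	     end |= mask;                   // saturate end to 2^64 - 1  */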

	insrdi	r4, r4, 16, 32
	cmpldi	r5, 32
	li	r9, -1
	rlwinm	r6, r3, 3, 26, 28 /* Calculate padding: bit offset of s
				     within its aligned doubleword.  */
	insrdi  r4, r4, 32, 0
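	/* r4 now holds the search byte c replicated into all eight byte
	   lanes, so a single cmpb can test eight bytes at once.  Roughly,
	   in C:
	     vc  = (uint8_t) c;
	     vc |= vc << 8;     // insrdi r4, r4, 8, 48
	     vc |= vc << 16;    // insrdi r4, r4, 16, 32
	     vc |= vc << 32;    // insrdi r4, r4, 32, 0  */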
	mr	r10, r7
	addi	r7, r7, -1
#ifdef __LITTLE_ENDIAN__
	sld	r9, r9, r6
#else
	srd	r9, r9, r6
#endif
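	/* r9 is now a mask with zeros in the byte positions that precede s
	   within its aligned doubleword (r6 is the bit distance from the
	   aligned load address back to s).  ANDing the cmpb result with it
	   discards any match that falls before the start of the buffer.
	   The shift direction depends on which end of the register holds
	   the first string byte.  */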
	ble	L(small_range)
	andi.	r11, r3, 63
	beq	cr0, L(align_qw)
	clrldi	r11, r3, 61
	ld	r12, 0(r8)     /* Load doubleword from memory.  */
	cmpb	r3, r12, r4     /* Check for BYTEs in DWORD1.  */
	and	r3, r3, r9
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
	cmpldi	cr7, r3, 0      /* Does r3 indicate we got a hit?  */
	bne	cr7, L(done)
	addi	r8, r8, 8
	addi	r5, r5, -8
	add	r5, r5, r11
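	/* The first, possibly unaligned, doubleword has been handled, so
	   r8 is advanced past it and the remaining length is recomputed
	   relative to the new position: r5 = r5 - (8 - (r3 & 7)).  */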

	/* Are we now aligned to a quadword boundary?  */
	andi.	r11, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD to make it QW aligned.  */
	ld	r12, 0(r8)
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r5, r5, -8
	addi	r8, r8, 8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
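	/* v10 now holds the byte sequence 0, 8, 16, ..., 120: lvsl yields
	   0..15 and vslb shifts each element left by 3.  Used as the
	   vbpermq selector, these are the bit indices of the first bit of
	   each byte, which lets vbpermq compress a 16-byte compare result
	   into a 16-bit mask.  v1 holds c splatted into all 16 lanes.  */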
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a necessary tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* The 64B loop requires a 64B-aligned address, so keep checking
	   16B blocks until that alignment is reached.  */
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16
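
	/* r8 was 16B aligned when the peeling started, so at most three
	   16B blocks can precede the next 64B boundary; hence the check
	   above is unrolled exactly three times.  */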
	/* At this point r8 is 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if no more than 64 bytes remain.  */
	ble	L(tail64)
	sub	r6, r10, r8
	srdi	r9, r6, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48
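
	/* The counter setup in C terms (r10 is the saturated end address
	   computed earlier):
	     iterations = (end - p) / 64;    // srdi r9, r6, 6
	   Each iteration scans 64 bytes; the remainder, r6 % 64, is
	   handled in L(tail64).  */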

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align 4
L(loop):
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	addi	r8, r8, 64	/* Adjust address for the next iteration.  */
	bdnz	L(loop)
	clrldi	r5, r6, 58	/* Remaining bytes: r6 modulo 64.  */

	/* Handle the remainder of the 64B loop, or r5 <= 64 bytes.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
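	/* The 16B loads below are aligned, so they cannot cross a page
	   boundary even when fewer than 16 bytes of the string remain;
	   any match found beyond s + n is rejected by the range check in
	   L(found_16B).  */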
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
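	/* After the shifts, each 16-bit match field occupies a distinct
	   halfword, so the ORs below merge them into one 64-bit mask in
	   which the position of a set bit is the byte offset of a match
	   within the 64B block.  */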
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6	/* Count trailing zeros (POWER8 has no cnttzd).  */
#else
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the matching byte.  */
	blr

	/* Found a match within a 16B block; verify that it lies inside
	   the string before returning.  */
	.align	4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Convert the match bits into a byte offset.  */
#ifdef __LITTLE_ENDIAN__
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6	/* Count trailing zeros (POWER8 has no cnttzd).  */
#else
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the matching byte.  */
	cmpld	r6, r5		/* Is the match within the remaining length?  */
	bltlr
	li	r3, 0
	blr

	.align	4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r0, r3, -1
	andc	r0, r0, r3
	popcntd	r0, r0	      /* Count trailing zeros.  */
#else
	cntlzd	r0, r3	      /* Count leading zeros before the match.  */
#endif
	cmpld	r8, r7         /* Are we on the last dword?  */
	srdi	r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
	add	r3, r8, r0
	cmpld	cr7, r0, r6     /* If on the last dword, check byte offset.  */
	bnelr
	blelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)
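	/* With n <= 32 the string spans at most five doublewords (a first,
	   possibly partial, one plus four more), so the checks below are
	   fully unrolled rather than looped.  */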
	ld	r12, 0(r8)     /* Load doubleword from memory.  */
	cmpb	r3, r12, r4     /* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	clrldi	r6, r7, 61      /* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3       /* Address of last doubleword.  */
	cmpld	r8, r7         /* Are we done already?  */
	bne	cr7, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)   /* Found something.  */
	beqlr		      /* Hit end of string (length).  */

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	bne	cr6, L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)