/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* size_t [r3] strlen (const char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif
	.machine  power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to a doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding: (s & 7) * 8 bits.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6      /* MASK = MASK << padding.  */
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)
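
	/* cmpb sets each result byte to 0xff where the corresponding input
	   bytes are equal and to 0x00 where they differ, so comparing
	   against a doubleword of zeros flags every null byte; the orc
	   above forces the padding bytes before the start of the string to
	   0xff so they can never match.  In effect, for each byte i:
	     r10[i] = (r9[i] == 0) ? 0xff : 0x00;  */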

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
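	/* mtcrf below copies the low nibble of the address into CR7, so CR
	   bit 28 receives bit 60 of r4 (the 0x8 bit): bt 28 branches when
	   r9 = r4 + 8 is already quadword aligned.  */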
	mtcrf	0x01,r4
	mr	r9,r4
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
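	/* In rough C-like pseudocode, each unrolled step below is:
	     dword1 = *p++;  dword2 = *p++;		(the ld/ldu pair)
	     if (cmpb (dword1, 0) | cmpb (dword2, 0))
	       goto dword_zero;
	     r9 += 16;		(r9 tracks the next unchecked byte)  */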
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r10,r9,63
	beq	cr0,L(preloop)
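	/* r9 always holds the address of the next unchecked doubleword, so
	   once it reaches a 64-byte boundary the vectorized loop can start
	   right there.  */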
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Point
	   the address back at the first doubleword of the pair; if the null
	   byte is there, we are done.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
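	/* Either way, r0 now holds the bit offset of the first null byte
	   within the doubleword at r4, so the length is
	   (r4 - r3) + r0 / 8.  */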
	subf	r5,r3,r4
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0      /* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9
	li	r7, 16	      /* Load required offsets.  */
	li	r8, 32
	li	r9, 48
	li	r12, 8
	vxor	v0,v0,v0      /* VR with null chars to use with
				 vcmpequb.  */
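	/* r0 still holds zero from the entry sequence and doubles as the
	   zero offset for the first lvx of each iteration.  */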

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
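	/* vminub takes the unsigned minimum of each byte pair, which is
	   zero exactly when at least one input byte is zero; three vminubs
	   therefore reduce the four quadwords to one, and a single
	   vcmpequb. tests all 64 bytes while recording the outcome
	   in CR6.  */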
	.p2align  5
L(loop):
	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
	lvx	  v2,r4,r7
	lvx	  v3,r4,r8
	lvx	  v4,r4,r9
	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
	lvx	  v2,r4,r7
	lvx	  v3,r4,r8
	lvx	  v4,r4,r9
	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
	bne	  cr6,L(vmx_zero)

	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
	lvx	  v2,r4,r7
	lvx	  v3,r4,r8
	lvx	  v4,r4,r9
	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
	vminub	  v6,v3,v4
	vminub	  v7,v5,v6
	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
	beq	  cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb  v1,v1,v0
	vcmpequb  v2,v2,v0
	vcmpequb  v3,v3,v0
	vcmpequb  v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb  v10,3
	lvsl	  v11,r0,r0
	vslb	  v10,v11,v10
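	/* lvsl with a zero address yields the byte sequence {0, 1, ..., 15};
	   shifting each byte left by 3 gives {0, 8, ..., 120}, the indices
	   of the most significant bit of each byte, i.e. of the bit that
	   vcmpequb set in every matching byte.  */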

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi  v2,v2,v2,2
	vsldoi  v3,v3,v3,4
	vsldoi  v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4
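	/* r10 now has one bit per byte of the 64-byte block, set where the
	   byte was null.  Since each bit maps to a byte, the zero count
	   computed below is already a byte count, so unlike the scalar path
	   no final division by 8 is needed.  */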

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
	andc	r9, r9,r10
	popcntd	r0, r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0      /* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)
