1/* Optimized strncpy implementation for POWER9 LE.
2   Copyright (C) 2020-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* Select the symbol name this file defines.  When USE_AS_STPNCPY is
   defined the stpncpy flavour is built, otherwise strncpy.  In either
   case an already defined STPNCPY / STRNCPY macro overrides the default
   name.  */
#if defined USE_AS_STPNCPY && defined STPNCPY
# define FUNC_NAME STPNCPY
#elif defined USE_AS_STPNCPY
# define FUNC_NAME __stpncpy
#elif defined STRNCPY
# define FUNC_NAME STRNCPY
#else
# define FUNC_NAME strncpy
#endif  /* FUNC_NAME selection  */
34
/* Choose the memset symbol called for large zero padding, unless MEMSET
   was already defined before this point.  For builds without IFUNC
   support, local calls should be made to the internal GLIBC symbol
   (created by libc_hidden_builtin_def); MEMSET_is_local records that
   this local symbol was selected.  */
#if !defined MEMSET && defined SHARED
# define MEMSET_is_local
# define MEMSET   __GI_memset
#elif !defined MEMSET
# define MEMSET   memset
#endif
45
/* Size of the stack frame created around the call to MEMSET.  The frame
   must hold the minimum frame plus the 8-byte save slot for r30 (stored
   at -8(r1) before the frame is pushed, i.e. at FRAMESIZE-8 afterwards).
   16 bytes are added instead of 8 so that the stack pointer stays
   quadword aligned while the callee runs; this assumes FRAME_MIN_SIZE
   (from sysdep.h) is itself a multiple of 16 -- confirm.  */
#define FRAMESIZE (FRAME_MIN_SIZE+16)
47
48/* Implements the function
49
50   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
51
52   or
53
54   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
55
56   if USE_AS_STPNCPY is defined.
57
58   The implementation can load bytes past a null terminator, but only
59   up to the next 16-byte aligned address, so it never crosses a page.  */
60
/* Register usage inside FUNC_NAME:

   r3   dest on entry; return value (for stpncpy it is updated to point
	to the last byte copied / one past the n-th byte)
   r4   source cursor
   r5   number of bytes still to be written to dest
   r11  destination cursor
   r7-r10  scratch (byte counts and stxvl length operands)
   v0-v3   source data, v1 is also the initial permute control
   v6   result of the compare against zero
   v18  all zeroes; note that VSX stores need it as VSR 32+v18  */

.machine power9
#ifdef MEMSET_is_local
ENTRY_TOCLESS (FUNC_NAME, 4)
#else
ENTRY (FUNC_NAME, 4)
#endif
	CALL_MCOUNT 2

	/* NULL string optimizations: nothing to do when n == 0.  */
	cmpdi   r5, 0
	beqlr

	/* Copy the first byte unconditionally; the rest of the code works
	   with r11 = dest + 1 and r5 = n - 1.  */
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r11,r3,1
	addi	r5,r5,-1
	vspltisb v18,0		/* Zeroes in v18  */
	cmpdi	r0,0
	beq	L(zero_padding)	/* Empty source: only padding is left.  */

	/* Empty/1-byte string optimization  */
	cmpdi	r5,0
#ifdef USE_AS_STPNCPY
	bgt	L(cont)
	/* Compute pointer to last byte copied into dest.  */
	addi	r3,r3,1
	blr
L(cont):
#else
	beqlr
#endif

	addi	r4,r4,1
	neg	r7,r4
	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */

	/* Get source 16B aligned.  The bytes starting at r4 are moved to
	   the front of v0 and the remainder is filled from v18 (zeroes).  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v18,v0,v1

	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
	vctzlsbb r7,v6		/* Number of trailing zeroes  */
	addi	r8,r7,1		/* Add null terminator  */

	/* r8 = bytes including null
	   r9 = bytes to get source 16B aligned
	   if r8 > r9
	      no null, copy r9 bytes
	   else
	      there is a null, copy r8 bytes and return.  */
	cmpld	r8,r9
	bgt	L(no_null)

	cmpld	cr6,r8,r5	/* r8 <= n?  */
	ble	cr6,L(null)

	/* The terminator lies beyond the remaining length: copy exactly
	   the remaining r5 bytes, no padding is needed.  */
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r5
#endif
	blr

	/* Terminator found within the remaining length: copy r8 bytes
	   (terminator included) and zero-fill what is left.  */
L(null):
	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r7
#endif
	add	r11,r11,r8
	sub	r5,r5,r8
	b L(zero_padding)

	/* No terminator in the bytes up to the alignment boundary.  */
L(no_null):
	cmpld	r9,r5		/* Check if length was reached.  */
	bge	L(n_tail1)

	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

	add	r4,r4,r9
	add	r11,r11,r9
	sub	r5,r5,r9

	/* Main loop: r4 is 16B aligned, 64 bytes are processed per
	   iteration while more than 64 bytes remain.  Nothing is stored
	   until all four quadwords were checked for a terminator.  */
L(loop):
	cmpldi	cr6,r5,64	/* Check if length was reached.  */
	ble	cr6,L(final_loop)

	lxv	32+v0,0(r4)
	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail1)

	lxv	32+v1,16(r4)
	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail2)

	lxv	32+v2,32(r4)
	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail3)

	lxv	32+v3,48(r4)
	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	addi	r5,r5,-64

	b	L(loop)

	/* At most 64 bytes remain.  For each quadword cr5 tells whether
	   the remaining length ends inside it (r5 <= 16) and cr6 whether
	   it holds a terminator.  r5 is reduced by 16 for every quadword
	   that is passed, so the tail code sees the count relative to the
	   last quadword loaded.  */
L(final_loop):
	cmpldi	cr5,r5,16
	lxv	32+v0,0(r4)
	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail1)
	bne	cr6,L(count_tail1)
	addi	r5,r5,-16

	cmpldi	cr5,r5,16
	lxv	32+v1,16(r4)
	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail2)
	bne	cr6,L(count_tail2)
	addi	r5,r5,-16

	cmpldi	cr5,r5,16
	lxv	32+v2,32(r4)
	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail3)
	bne	cr6,L(count_tail3)
	addi	r5,r5,-16

	lxv	32+v3,48(r4)
	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
	beq	cr6,L(n_tail4)

	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail4)

	/* The n_tailN blocks handle the case where the length ends in the
	   N-th quadword before any terminator: the previous quadwords are
	   stored whole, the last one partially, and no padding is needed.  */
L(n_tail4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,48	/* Offset */
	stxvl	32+v3,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail1):
	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail1)

L(n_tail1):
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail2):
	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail2)

L(n_tail2):
	stxv	32+v0,0(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,16	/* Offset */
	stxvl	32+v1,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail3):
	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail3)

L(n_tail3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,32	/* Offset */
	stxvl	32+v2,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r5
#endif
	blr

	/* The tailN blocks handle a terminator found in the N-th quadword:
	   the previous quadwords are stored whole, the last one up to and
	   including the terminator, then the rest of dest is zero-filled.
	   The prep_tailN entries (reached from the main loop) first make
	   r5 relative to the N-th quadword, as the final loop already
	   does before branching to count_tailN / tailN.  */
L(prep_tail1):
L(count_tail1):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail1):
	addi	r9,r8,1		/* Add null terminator  */
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b L(zero_padding)

L(prep_tail2):
	addi	r5,r5,-16
L(count_tail2):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail2):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,16	/* Offset */
	stxvl	32+v1,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b L(zero_padding)

L(prep_tail3):
	addi	r5,r5,-32
L(count_tail3):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail3):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,32	/* Offset */
	stxvl	32+v2,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b L(zero_padding)

L(prep_tail4):
	addi	r5,r5,-48
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail4):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,48	/* Offset */
	stxvl	32+v3,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* Compute pointer to last byte copied into dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	/* Fall through to the zero padding.  */

/* This code pads the remainder of dest with NULL bytes.  For large numbers
   memset gives a better performance, 255 was chosen through experimentation.

   On entry r11 points to the first byte to clear and r5 holds the number
   of bytes to clear (possibly zero).  */
L(zero_padding):
	cmpldi	r5,255
	bge	L(zero_padding_memset)

L(zero_padding_loop):
	cmpldi	cr6,r5,16	/* Check if length was reached.  */
	ble	cr6,L(zero_padding_end)

	/* stxv/stxvl take a VSX register number; VR 18 is VSR 32+18.  Using
	   plain v18 here would store VSR 18, which is never initialized by
	   this function.  */
	stxv	32+v18,0(r11)
	addi	r11,r11,16
	addi	r5,r5,-16

	b	L(zero_padding_loop)

L(zero_padding_end):
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v18,r11,r10	/* Partial store  */
	blr

	.align	4
L(zero_padding_memset):
	std	r30,-8(r1)   /* Save r30 on the stack.  */
	cfi_offset(r30, -8)
	mr	r30,r3       /* Save the return value of strncpy.  */
	/* Prepare the call to memset.  r5 already holds the length.  */
	mr	r3,r11       /* Pointer to the area to be zero-filled.  */
	li	r4,0         /* Byte to be written (zero).  */

	/* We delayed the creation of the stack frame, as well as the saving of
	   the link register, because only at this point, we are sure that
	   doing so is actually needed.  */

	/* Save the link register.  */
	mflr	r0
	std	r0,16(r1)

	/* Create the stack frame.
	   NOTE(review): r1 has to stay 16-byte aligned across the call;
	   confirm that FRAMESIZE is a multiple of 16.  */
	stdu	r1,-FRAMESIZE(r1)
	cfi_adjust_cfa_offset(FRAMESIZE)
	cfi_offset(lr, 16)

	bl	MEMSET
#ifndef MEMSET_is_local
	nop
#endif

	ld	r0,FRAMESIZE+16(r1)

	mr	r3,r30       /* Restore the return value saved in r30 before
				the call: dest for strncpy, the pointer
				computed by the copy code for stpncpy.  The
				value returned by memset is discarded.  */
	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
	/* Restore the stack frame.  */
	addi	r1,r1,FRAMESIZE
	cfi_adjust_cfa_offset(-FRAMESIZE)
	/* Restore the link register.  */
	mtlr	r0
	cfi_restore(lr)
	blr

END (FUNC_NAME)
#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif
411