/* strcpy/stpcpy - copy a string returning pointer to start/end.
   Copyright (C) 2013-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

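/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
   the slower entry path.  This option is not intended for production use.

   NOTE(review): nothing in this file tests STRCPY_TEST_PAGE_CROSS, so the
   paragraph above looks like a leftover from an earlier implementation;
   confirm the option still has an effect before relying on it.  */
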
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

/* Arguments and results.  dstin and result deliberately name the same
   register (x0): nothing writes x0 in the strcpy build, so the destination
   pointer is returned as received, while the stpcpy build overwrites it
   with the address of the copied terminator just before returning.  */
#define dstin		x0
#define srcin		x1
#define result		x0

/* General-purpose scratch registers.  Two pairs share a register because
   their live ranges never overlap:
     len / synd   (x4) - the syndrome is consumed when it is turned into a
			 length;
     tmp / shift  (x5) - shift is only needed for the first chunk; wtmp is
			 the 32-bit view of the same register.
   data1/data2 and their 32-bit views carry the 8-, 4- and 2-byte copies.  */
#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define wtmp		w5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

/* Vector registers.  dataq/vdata are two views of register 0, vend/dend of
   register 3.  dataq2 (q1) is the same register as vhas_nul (v1): it is only
   loaded after the comparison result held in vhas_nul has been consumed.
   Every register named in this file (x0-x7, v0-v3) is caller-saved under
   AAPCS64, so the routine saves and restores nothing.  */
#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vrepmask	v2
#define vend		v3
#define dend		d3
#define dataq2		q1

/* STRCPY is the symbol being defined.  IFSTPCPY emits its argument only in
   the stpcpy build.  The instruction passed to it contains commas, which the
   preprocessor treats as argument separators; the X,__VA_ARGS__ expansion
   glues the pieces back together.  In the strcpy build it expands to
   nothing.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY strcpy
# define IFSTPCPY(X,...)
#endif

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL and
   bits 4-7 are zero. For odd bytes, bits 4-7 are set if the byte is NUL and
   bits 0-3 are zero, so that adjacent bytes can be merged with a pairwise
   add. Since the bits in the syndrome reflect the order in which the bytes
   occur in the original string, counting trailing zeros and dividing by four
   identifies exactly which byte is the terminating NUL.

   The main loop only needs to know whether a chunk contains a NUL at all, so
   it uses a cheaper pairwise-maximum test and computes the exact syndrome
   once, after the loop has exited.  */

/* STRCPY (dstin, srcin): copy the NUL-terminated string at srcin, including
   its terminator, to dstin.

   In:     x0 = dstin, x1 = srcin.
   Out:    x0 = dstin when built as strcpy (x0 is never written);
	   x0 = address of the copied NUL when built as __stpcpy.
   Uses:   x2-x7, v0-v3 and the condition flags (subs in L(less8)); the
	   stack is not touched.

   Source reads that may touch bytes outside the string (before its first
   byte or after its NUL) are always 16-byte loads from a 16-byte-aligned
   address, so they stay inside an aligned 16-byte granule that holds at
   least one byte of the string.  Every other load reads only bytes of the
   string or its terminator.

   Once the length is known, the copy is done with two possibly overlapping
   accesses of the same size: one anchored at the first byte and one that
   ends exactly on the NUL.  */
ENTRY (STRCPY)
	/* PTR_ARG is supplied by sysdep.h.  NOTE(review): presumably it
	   normalises pointer argument N for ABIs with 32-bit pointers -
	   confirm against its definition.  */
	PTR_ARG (0)
	PTR_ARG (1)

	/* First chunk: the aligned 16 bytes that contain srcin[0].
	   vrepmask holds 0xf00f in every halfword, i.e. 0x0f in even bytes
	   and 0xf0 in odd bytes.  ANDing it with the 0x00/0xff compare
	   result and adding adjacent bytes leaves one nibble per source
	   byte in the low 64 bits of vend.  */
	bic	src, srcin, 15
	mov	wtmp, 0xf00f
	ld1	{vdata.16b}, [src]
	dup	vrepmask.8h, wtmp
	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = 4 * srcin.  A register-controlled lsr uses only the low
	   six bits of the count, so the shift below is by
	   4 * (srcin % 16): it drops the nibbles of the bytes that lie
	   before srcin.  */
	lsl	shift, srcin, 2
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)		/* NUL within the first chunk.  */

	/* Second chunk: advance src to the next aligned 16 bytes (pre-index
	   writeback) and build the same nibble syndrome.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(start_loop)	/* Still no NUL: long string.  */

	/* NUL is in the second chunk.  rbit + clz counts trailing zero bits,
	   which is four times the index of the NUL within the chunk.
	   NOTE(review): big-endian builds skip the rbit, presumably because
	   ldr q already places the first byte at the most significant end -
	   confirm.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	/* tmp = bytes between srcin and this chunk (1..16), so
	   len = tmp + index = offset of the NUL from srcin (1..31).  */
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	/* len <= 31 here, so bit 4 clear means len < 16.  */
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy len + 1 bytes as two 16-byte pieces, the
	   second one ending on the NUL (offset len - 15).  Both loads are
	   issued before either store.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4,,8
L(tail):
	/* NUL in the first chunk.  synd has already been shifted so that
	   nibble 0 describes srcin[0]; len = index of the NUL (0..15).
	   The rbit is unconditional on this path; NOTE(review): presumably
	   because this syndrome came from ld1 {.16b}, whose byte lanes do not
	   depend on endianness - confirm.  Execution continues into
	   L(less16) below.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2

	.p2align 4
L(less16):
	/* len <= 15.  Bit 3 clear means len < 8.  */
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two 8-byte pieces, the second ending on the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	/* len <= 7.  tmp = len - 3; an unsigned borrow means len < 3.  */
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 3 <= len <= 7: two 4-byte pieces, the second ending on the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len is 0, 1 or 2.  For a non-empty string copy bytes 0-1 (byte 1
	   is either a character or the NUL itself), then write the
	   terminator at dstin[len].  */
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* Neither of the first two chunks holds a NUL.  src is the aligned
	   address of the second chunk, whose data is still in dataq.
	   len = src - srcin (1..16) and dst is the destination address that
	   corresponds to src.  The unaligned 16 bytes at srcin lie entirely
	   inside those two NUL-free chunks, so they can be copied as-is.  */
	sub	len, src, srcin
	ldr	dataq2, [srcin]
	add	dst, dstin, len
	str	dataq2, [dstin]

	.p2align 5
L(loop):
	/* Store the chunk already known to be NUL-free (post-incrementing
	   dst), then fetch and test the next aligned chunk.  umaxp folds the
	   16 compare bytes into 8, so synd is non-zero exactly when the
	   chunk contains a NUL.  */
	str	dataq, [dst], 16
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)

	/* The chunk at src holds the NUL and has not been stored yet; dst is
	   its destination.  Build the exact nibble syndrome to locate it.  */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	synd, dend
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	/* len = index of the NUL within the chunk (0..15).  tmp = len - 15
	   is zero or negative: the final 16-byte piece ends on the NUL and
	   re-copies some bytes that were already written.  */
	clz	len, synd
	lsr	len, len, 2
	sub	tmp, len, 15
	ldr	dataq, [src, tmp]
	str	dataq, [dst, tmp]
	IFSTPCPY (add result, dst, len)
	ret

END (STRCPY)

/* Symbol export.  The routine is defined as __stpcpy in the stpcpy build and
   as strcpy otherwise (see the STRCPY macro).  weak_alias and the
   libc_hidden_* macros are defined outside this file.  NOTE(review): by
   their names they publish stpcpy as a weak alias of __stpcpy and set up the
   library-internal hidden aliases - confirm against their definitions.  */
#ifdef BUILD_STPCPY
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)
#else
libc_hidden_builtin_def (strcpy)
#endif
