/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x14
#define A_x	x6
#define B_x	x7
#define A_w	w6
#define B_w	w7

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define Q_q	q6
#define S_q	q22

/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration.

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals, since the former would unnecessarily break across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   bumping the small-copy limit up to 32 bytes allows us to do that without
   cost and also allows us to reduce the size of the prep code before loop64.

   The copy loop uses only one register, q0.  This is to ensure that all loads
   hit a single hardware prefetcher, which can then be trained correctly to
   prefetch a single stream.

   The non-temporal stores help optimize cache utilization.  */
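
/* As a rough orientation only, the size dispatch below behaves like the
   following C sketch, where copy_small, copy_medium and copy_large are
   hypothetical names standing in for L(copy32), L(copy128) and the
   L(copy_long)/loop64 path; they are not symbols defined in this file:

     if (count <= 32)
       copy_small (dstin, src, count);
     else if (count <= 128)
       copy_medium (dstin, src, count);
     else
       copy_large (dstin, src, count);
*/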

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 32
	add	srcend, src, count
	add	dstend, dstin, count
	b.ls	L(copy32)
	cmp	count, 128
	b.hi	L(copy_long)

	/* Medium copies: 33..128 bytes.  */
L(copy128):
	sub	tmp1, count, 1
	ldr	A_q, [src]
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	ldr	D_q, [srcend, -16]
	tbz	tmp1, 6, 1f
	ldr	E_q, [src, 32]
	ldr	F_q, [src, 48]
	ldr	G_q, [srcend, -64]
	ldr	H_q, [srcend, -48]
	str	G_q, [dstend, -64]
	str	H_q, [dstend, -48]
	str	E_q, [dstin, 32]
	str	F_q, [dstin, 48]
1:
	str	A_q, [dstin]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..32 bytes.  */
L(copy32):
	/* 16-32 */
	cmp	count, 16
	b.lo	1f
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret
	.p2align 4
1:
	/* 8-15 */
	tbz	count, 3, 1f
	ldr	A_x, [src]
	ldr	B_x, [srcend, -8]
	str	A_x, [dstin]
	str	B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_w, [src]
	ldr	B_w, [srcend, -4]
	str	A_w, [dstin]
	str	B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_w, [src]
	ldrh	B_w, [srcend, -2]
	strh	A_w, [dstin]
	strh	B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_w, [src]
	strb	A_w, [dstin]
1:
	ret

	/* Align SRC to 16 bytes and copy; that way at least one of the
	   accesses is aligned throughout the copy sequence.

	   The count is then off by 0 to 15 bytes, but this is OK because the
	   final 64 bytes are always copied separately from the end of the
	   buffer, so the loop never runs out of bounds.  */
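
	/* Worked example (illustrative): with count == 200 and (src & 15) == 7,
	   the adjusted count is 200 - 80 + 7 == 127, so loop64 runs twice
	   (128 bytes) after the initial unaligned 16-byte copy, and the final
	   64 bytes are copied separately from srcend - 64; every store stays
	   within [dstin, dstend).  */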

	.p2align 4
	nop		/* Align loop64 below.  */
L(copy_long):
	ldr	A_q, [src]
	sub	count, count, 64 + 16
	and	tmp1, src, 15
	str	A_q, [dstin]
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1

L(loop64):
	ldr	A_q, [src, 16]!
	str	A_q, [dst, 16]
	ldr	A_q, [src, 16]!
	subs	count, count, 64
	str	A_q, [dst, 32]
	ldr	A_q, [src, 16]!
	str	A_q, [dst, 48]
	ldr	A_q, [src, 16]!
	str	A_q, [dst, 64]!
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
	ldr	E_q, [srcend, -64]
	str	E_q, [dstend, -64]
	ldr	D_q, [srcend, -48]
	str	D_q, [dstend, -48]
	ldr	C_q, [srcend, -32]
	str	C_q, [dstend, -32]
	ldr	B_q, [srcend, -16]
	str	B_q, [dstend, -16]
	ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)


/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 32 bytes and under.
   * Medium sized moves of 33-128 bytes (fully unrolled).
   * Large moves where the source address is higher than the destination
     (forward copies)
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers q6 and q22 for the moves and move 32 bytes at a
   time to correctly train the hardware prefetcher for better throughput.

   For small and medium cases memcpy is used.  */
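
/* The overlap check at the start of __memmove_falkor corresponds roughly to
   the C sketch below; copy_as_memcpy, backward and forward are hypothetical
   names, not symbols defined in this file.  Note that the unsigned
   subtraction wraps around when dstin < src, so such moves always take the
   forward path:

     if (count <= 128)
       copy_as_memcpy (dstin, src, count);
     else if ((uintptr_t) dstin - (uintptr_t) src < count)
       backward (dstin, src, count);
     else
       forward (dstin, src, count);
*/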

ENTRY_ALIGN (__memmove_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 32
	add	srcend, src, count
	add	dstend, dstin, count
	b.ls	L(copy32)
	cmp	count, 128
	b.ls	L(copy128)
	/* Overlap check: take the backward-copy path only when the
	   destination lies within the source buffer, i.e. when
	   dstin - src is (unsigned) smaller than count.  */
	sub	tmp1, dstin, src
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* CASE: Copy Forwards

	   Align src to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
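
	/* Roughly, the loop below is a software-pipelined form of this C
	   sketch, where load16/store16 are hypothetical 16-byte vector
	   accesses (illustrative only):

	     q = load16 (src);  s = load16 (src + 16);
	     while (more than 32 bytes remain)
	       {
		 store16 (dst, q);       q = load16 (src + 32);
		 store16 (dst + 16, s);  s = load16 (src + 48);
		 src += 32;  dst += 32;
	       }
	     then store the pending q/s pair and the final 32 bytes taken
	     from srcend - 32.  */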

	ldr	S_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	Q_q, [src, 16]!
	str	S_q, [dstin]
	ldr	S_q, [src, 16]!
	sub	count, count, 32 + 32 + 16	/* Test and readjust count.  */

	.p2align 4
1:
	subs	count, count, 32
	str	Q_q, [dst, 16]
	ldr	Q_q, [src, 16]!
	str	S_q, [dst, 32]!
	ldr	S_q, [src, 16]!
	b.hi	1b

	/* Copy 32 bytes from the end before writing the data prefetched in the
	   last loop iteration.  */
2:
	ldr	B_q, [srcend, -32]
	ldr	C_q, [srcend, -16]
	str	Q_q, [dst, 16]
	str	S_q, [dst, 32]
	str	B_q, [dstend, -32]
	str	C_q, [dstend, -16]
	ret

	/* CASE: Copy Backwards

	   Align srcend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
	nop
	nop
L(move_long):
	cbz	tmp1, 3f  /* Return early if src == dstin */
	ldr	S_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldr	Q_q, [srcend, -16]!
	str	S_q, [dstend, -16]
	sub	count, count, tmp1
	ldr	S_q, [srcend, -16]!
	sub	dstend, dstend, tmp1
	sub	count, count, 32 + 32

1:
	subs	count, count, 32
	str	Q_q, [dstend, -16]
	ldr	Q_q, [srcend, -16]!
	str	S_q, [dstend, -32]!
	ldr	S_q, [srcend, -16]!
	b.hi	1b

	/* Copy 32 bytes from the start before writing the data prefetched in the
	   last loop iteration.  */

	ldr	B_q, [src, 16]
	ldr	C_q, [src]
	str	Q_q, [dstend, -16]
	str	S_q, [dstend, -32]
	str	B_q, [dstin, 16]
	str	C_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)
#endif