/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp2	x6
#define tmp3	x7
#define tmp3w   w7
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define I_q	q16
#define J_q	q17

#define A_v	v0
#define B_v	v1
#define C_v	v2
#define D_v	v3
#define E_v	v4
#define F_v	v5
#define G_v	v6
#define H_v	v7
#define I_v	v16
#define J_v	v17

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

#undef MEMCPY
#define MEMCPY __memcpy_thunderx2
#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2


/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used. Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.
*/

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
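	/* tmp1 = dstin - src.  If count > 96 and tmp1 < count
	   (unsigned), the destination starts within the source, so
	   the copy must run backwards.  The ccmp compares tmp1 with
	   count only when count > 96; otherwise it forces the flags
	   so that b.lo is not taken and execution falls through into
	   memcpy below.  */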
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and, when src and dst are not equally aligned, use a load-and-merge
   approach so that the actual loads and stores are always aligned.
   The large-copy loops process 64 bytes per iteration in the unaligned
   case and 128 bytes per iteration in the aligned case.
*/

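/* Software prefetch distance, in bytes, used by the large-copy loops
   below (as the prfm offset).  */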
#define MEMCPY_PREFETCH_LDR 640

	.p2align 4
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	ldr	A_q, [src], #16
	add	dstend, dstin, count
	and	tmp1, src, 15
	cmp	count, 96
	b.hi	L(memcopy_long)

	/* Medium copies: 17..96 bytes.  */
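	/* The first 16 bytes are already in A_q and the last 16 are
	   loaded into E_q next; the stores below may overlap, which
	   avoids having to handle the exact size.  */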
	ldr	E_q, [srcend, -16]
	cmp	count, 64
	b.gt	L(memcpy_copy96)
	cmp	count, 48
	b.le	L(bytes_17_to_48)
	/* 49..64 bytes */
	ldp	B_q, C_q, [src]
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	str	C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes */
	cmp	count, 32
	b.gt	L(bytes_32_to_48)
	/* 17..32 bytes */
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 32..48 */
	ldr	B_q, [src]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	str	B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp	count, 8
	b.lo	L(bytes_0_to_8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
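	/* For example, with count==3 (tmp1==1) the three strb
	   instructions below write destination bytes 0, 1 and 2
	   exactly once.  */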
L(bytes_0_to_3):
	cbz	count, 1f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
	strb	A_lw, [dstin]
1:
	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
	   E_q (last 16 bytes) are already loaded.  The size
	   is large enough to benefit from aligned loads.  */
	bic	src, src, 15
	ldp	B_q, C_q, [src]
	/* Loaded 64 bytes; the second 16-byte chunk may overlap
	   the first chunk by tmp1 bytes.  Stored 16 bytes.  */
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	/* count, originally in [65..96], becomes [65..111] after
	   tmp1 [0..15] is added to it; count is now
	   <bytes-left-to-load> + 48.  */
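	/* Hence count <= 80 means at most 32 bytes remain beyond the
	   aligned B_q/C_q chunk; one more aligned load (D_q) plus
	   the final E_q store cover them.  */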
	cmp	count, 80
	b.gt	L(copy96_medium)
	ldr	D_q, [src, 32]
	stp	B_q, C_q, [dst, 16]
	str	D_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(copy96_medium):
	ldp	D_q, G_q, [src, 32]
	cmp	count, 96
	b.gt	L(copy96_large)
	stp	B_q, C_q, [dst, 16]
	stp	D_q, G_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(copy96_large):
	ldr	F_q, [src, 64]
	str	B_q, [dst, 16]
	stp	C_q, D_q, [dst, 32]
	stp	G_q, F_q, [dst, 64]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(memcopy_long):
	bic	src, src, 15
	ldp	B_q, C_q, [src], #32
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	add	dst, dst, 16
	and	tmp1, dst, 15
	ldp	D_q, E_q, [src], #32
	str	A_q, [dstin]

	/* Already loaded 64+16 bytes.  Check whether at least
	   64 more bytes are left.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit0)
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt	L(loop128)
	cbnz	tmp1, L(dst_unaligned)
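	/* Reserve the prefetch distance: the prefetching loop then
	   stops early enough for the remaining data to be handled
	   by the plain loop128 below.  The reserved amount is added
	   back after the loop.  */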
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	.p2align 4

L(loop128_prefetch):
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	ldp	H_q, I_q, [src], #32
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	B_q, C_q, [src], #32
	stp	D_q, E_q, [dst], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
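	/* Software-pipelined: B_q..E_q and F_q..I_q alternate as the
	   load and store register groups, 64 bytes per half
	   iteration.  */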
L(loop128):
	ldp	F_q, G_q, [src], #32
	ldp	H_q, I_q, [src], #32
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 64
	b.ge	L(loop128)
L(loop128_exit0):
	ldp	F_q, G_q, [srcend, -64]
	ldp	H_q, I_q, [srcend, -32]
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst]
	stp	F_q, G_q, [dstend, -64]
	stp	H_q, I_q, [dstend, -32]
	ret
L(loop128_exit1):
	ldp	B_q, C_q, [srcend, -64]
	ldp	D_q, E_q, [srcend, -32]
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst]
	stp	B_q, C_q, [dstend, -64]
	stp	D_q, E_q, [dstend, -32]
	ret

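	/* Tail of the merged-store path: A_q, B_q, H_q, I_q hold
	   pending merged quadwords; G_q still holds raw source data
	   and is stored unaligned at dst+tmp1 to complete the merged
	   stream.  The last 64 bytes are copied directly from the
	   end of the source.  */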
L(dst_unaligned_tail):
	ldp	C_q, D_q, [srcend, -64]
	ldp	E_q, F_q, [srcend, -32]
	stp	A_q, B_q, [dst], #32
	stp	H_q, I_q, [dst], #16
	str	G_q, [dst, tmp1]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstend, -32]
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   plain unaligned store accesses.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored earlier; count = bytes left to
	   be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   uses ext to combine the top tmp1 bytes of one register
	   with the low (16-tmp1) bytes of the following register,
	   forming the value for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, there are 15 code chunks, one for each
	   possible index value, and a computed goto is used to
	   reach the required one.  */

	/* Store 32 bytes to dst and align dst for further
	   operations; some bytes will be stored at this
	   address once more.  */

	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	bic	dst, dst, 15
	sub	count, count, 32
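	/* Computed goto: the word at L(ext_table)[tmp1] holds the
	   offset of L(ext_size_<tmp1>) relative to that table entry;
	   it is loaded, sign-extended, added back to the entry's
	   address and branched to.  */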
	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	ldr	tmp3w, [tmp2]
	add	tmp2, tmp2, tmp3w, SXTW
	br	tmp2

.p2align 4
	/* Make the loop in each chunk 16-byte aligned.  */
	nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
	stp     A_q, B_q, [dst], #32;\
	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp     C_q, D_q, [src], #32;\
	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp     H_q, I_q, [dst], #32;\
	ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
	ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ldp     F_q, G_q, [src], #32;\
	ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
	subs    count, count, 64;\
	b.ge    1b;\
2:;\
	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	b	L(dst_unaligned_tail);

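/* Instantiate one chunk per possible destination misalignment 1..15.  */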
EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
	.p2align 4
1:
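	/* If dst == src (tmp1 == 0) there is nothing to do.
	   Otherwise copy backwards: store the unaligned tail first,
	   align srcend, loop over 64-byte blocks from the end
	   towards the start, and finish with the first 64 bytes.  */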
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	and	tmp1, srcend, 15
	ldr	D_q, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	.p2align 4
1:
	subs	count, count, 64
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -64]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	E_q, F_q, [src, 32]
	ldp	G_q, H_q, [src]
	stp	A_q, B_q, [dstend, -32]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	G_q, H_q, [dstin]
3:	ret


END (MEMCPY)
	.section	.rodata
	.p2align	4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	0
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

libc_hidden_builtin_def (MEMCPY)
#endif
