/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
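/* The *_chk entries verify that the destination buffer (size in RCX)
   can hold LEN bytes before continuing into the corresponding
   unchecked entry below.  */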
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

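/* mempcpy returns DST + LEN while memmove and memcpy return DST, so each
   entry computes its return value in RAX up front and then shares the
   copy code at L(start).  */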
ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
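	/* RCX = one past the end of the source, R9 = one past the end of
	   the destination.  Blocks larger than 512 bytes are handled
	   separately below.  */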
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

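/* Sizes of 512 bytes or less: copy a chunk from the start and a chunk of
   the same size from the end of the block.  The two chunks may overlap,
   so the whole range is covered regardless of copy direction.  */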
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups -0xC0(%rcx), %zmm5
	vmovups -0x80(%rcx), %zmm6
	vmovups -0x40(%rcx), %zmm7
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups %zmm5, -0xC0(%r9)
	vmovups %zmm6, -0x80(%r9)
	vmovups %zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups -0x80(%rcx), %zmm2
	vmovups -0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, -0x80(%r9)
	vmovups %zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu 0x20(%rsi), %ymm1
	vmovdqu -0x40(%rcx), %ymm2
	vmovdqu -0x20(%rcx), %ymm3
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, 0x20(%rdi)
	vmovdqu %ymm2, -0x40(%r9)
	vmovdqu %ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu -0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu (%rsi), %xmm0
	vmovdqu -0x10(%rcx), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

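/* More than 512 bytes: if LEN is at least half of the shared cache size,
   use non-temporal stores so the copy does not evict useful cache lines;
   otherwise use the regular 512-byte copy loops below.  */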
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
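	/* 513..1024 bytes: prefetch both halves, then copy 512 bytes from
	   the start and 512 bytes from the end of the block.  */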
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups %zmm9, -0x1C0(%r9)
	vmovups %zmm10, -0x180(%r9)
	vmovups %zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups %zmm13, -0xC0(%r9)
	vmovups %zmm14, -0x80(%r9)
	vmovups %zmm15, -0x40(%r9)
	ret

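/* 1025 bytes up to half the shared cache size: copy forward when the
   destination is below the source and backward otherwise.  For the
   forward case the last 512 bytes of the source are loaded into
   ZMM8-ZMM15 before the loop, so an overlapping copy cannot clobber
   them; they are stored once the loop finishes.  */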
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
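	/* Store the preloaded last 512 bytes of the source; R9 points at
	   the original destination end minus 512.  */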
	vmovups %zmm8, (%r9)
	vmovups %zmm9, 0x40(%r9)
	vmovups %zmm10, 0x80(%r9)
	vmovups %zmm11, 0xC0(%r9)
	vmovups %zmm12, 0x100(%r9)
	vmovups %zmm13, 0x140(%r9)
	vmovups %zmm14, 0x180(%r9)
	vmovups %zmm15, 0x1C0(%r9)
	ret

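/* Backward copy for DST > SRC: the first 512 bytes of the source are
   loaded into ZMM8-ZMM15 before the loop and stored last, so an
   overlapping backward copy cannot clobber them.  */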
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups 0x180(%rsi), %zmm9
	vmovups 0x140(%rsi), %zmm10
	vmovups 0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups 0x80(%rsi), %zmm13
	vmovups 0x40(%rsi), %zmm14
	vmovups (%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups -0x40(%rcx), %zmm0
	vmovups -0x80(%rcx), %zmm1
	vmovups -0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups -0x140(%rcx), %zmm4
	vmovups -0x180(%rcx), %zmm5
	vmovups -0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups %zmm0, -0x40(%r9)
	vmovups %zmm1, -0x80(%r9)
	vmovups %zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups %zmm4, -0x140(%r9)
	vmovups %zmm5, -0x180(%r9)
	vmovups %zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
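	/* Store the preloaded first 512 bytes of the source at the start
	   of the destination; RDI points at the original destination
	   plus 512.  */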
	vmovups %zmm8, -0x40(%rdi)
	vmovups %zmm9, -0x80(%rdi)
	vmovups %zmm10, -0xC0(%rdi)
	vmovups %zmm11, -0x100(%rdi)
	vmovups %zmm12, -0x140(%rdi)
	vmovups %zmm13, -0x180(%rdi)
	vmovups %zmm14, -0x1C0(%rdi)
	vmovups %zmm15, -0x200(%rdi)
	ret

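/* Sizes of at least half the shared cache: use non-temporal stores so the
   copy bypasses the cache.  The destination is rounded up to the next
   128-byte boundary for the streaming stores; the skipped head bytes are
   saved in ZMM4/ZMM5 up front and written at the very end, which also
   keeps an overlapping forward copy correct.  */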
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
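	/* Fence the non-temporal stores, write the saved first 128 bytes
	   at the original destination (R11), and let L(check) copy the
	   remaining tail of at most 256 bytes; RCX and R9 still hold the
	   original end pointers.  */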
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

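/* Backward non-temporal copy for DST > SRC: the destination end is rounded
   down to a 128-byte boundary, the trimmed tail bytes are saved in
   ZMM4/ZMM5 and written last, and the remaining head is handled by
   L(check).  */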
L(preloop_large_bkw):
	vmovups -0x80(%rcx), %zmm4
	vmovups -0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0,	-0x100(%r9)
	vmovntdq %zmm1,	-0xC0(%r9)
	vmovntdq %zmm2,	-0x80(%r9)
	vmovntdq %zmm3,	-0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
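	/* Fence the non-temporal stores, write the saved last 128 bytes
	   just below the original destination end (R8), and let L(check)
	   copy the remaining head of at most 256 bytes.  */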
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

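/* memcpy can alias memmove directly because the implementation above
   already handles overlapping source and destination.  */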
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif