/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
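/* The size dispatch below can be read as the following C sketch.  This is
   illustrative only and not part of the build: VEC is a stand-in for
   VEC_SIZE, store_vec stands in for one unaligned vector store (VMOVU),
   and the masked-store, rep-stosb and integer-store special cases of the
   real code are folded into plain byte loops.

     #include <stddef.h>
     #include <stdint.h>

     enum { VEC = 32 };			// hypothetical vector width

     static void store_vec (unsigned char *p, int c)
     {
       for (int i = 0; i < VEC; i++)	// one VMOVU of the splatted byte
	 p[i] = (unsigned char) c;
     }

     void *memset_sketch (void *dst, int c, size_t n)
     {
       unsigned char *p = dst;
       if (n < VEC)			// tiny: GPR or masked stores
	 for (size_t i = 0; i < n; i++)
	   p[i] = (unsigned char) c;
       else if (n <= 2 * VEC)		// 2 overlapping VEC stores
	 {
	   store_vec (p, c);
	   store_vec (p + n - VEC, c);
	 }
       else if (n <= 4 * VEC)		// 4 overlapping VEC stores
	 {
	   store_vec (p, c);
	   store_vec (p + VEC, c);
	   store_vec (p + n - 2 * VEC, c);
	   store_vec (p + n - VEC, c);
	 }
       else				// head, aligned 4x loop, tail
	 {
	   unsigned char *end = p + n - 4 * VEC;
	   for (int i = 0; i < 4; i++)
	     store_vec (p + i * VEC, c);
	   unsigned char *q = (unsigned char *)
	     ((uintptr_t) (p + 4 * VEC) & -(uintptr_t) (2 * VEC));
	   for (; q < end; q += 4 * VEC)
	     for (int i = 0; i < 4; i++)
	       store_vec (q + i * VEC, c);
	   for (int i = 0; i < 4; i++)	// overlapping tail
	     store_vec (end + i * VEC, c);
	 }
       return dst;
     }  */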

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0				xmm0
#endif

#ifndef YMM0
# define YMM0				ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# define VZEROUPPER_SHORT_RETURN	rep; ret
#endif
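/* 'rep; ret' is a 2-byte return used to avoid the branch-prediction
   penalty that some AMD processors take when a plain ret is the direct
   target of a branch.  */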

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

#if VEC_SIZE == 64
# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
#else
# define LOOP_4X_OFFSET	(0)
#endif

#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG	rcx
# define LOOP_REG	rdi
#else
# define END_REG	rdi
# define LOOP_REG	rdx
#endif

#define PAGE_SIZE 4096

/* Macro to calculate the size of a small memset block, used for alignment
   purposes.  */
#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
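/* SMALL_MEMSET_ALIGN (mov_sz, ret_sz) evaluates to 2 * mov_sz + ret_sz + 1,
   roughly the byte size of one of the small-size blocks below; e.g.
   SMALL_MEMSET_ALIGN (3, 1) == 8.  It is used as the max-skip operand of
   .p2align, so a small target is aligned only when the required padding
   does not exceed (roughly) the size of the block itself.  */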


#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
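/* __bzero is provided only by the SSE2 (VEC_SIZE == 16) build inside libc;
   it feeds a zero fill value and a zeroed XMM0 into the common memset body
   at L(entry_from_bzero).  */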
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	xorl	%esi, %esi
	pxor	%XMM0, %XMM0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

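/* __wmemset reuses the byte-memset body: the wchar_t count in RDX is
   scaled to a byte count (shl $2, since wchar_t is 4 bytes here), and
   WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN, defined by the including file,
   broadcasts the 32-bit value from ESI and sets up the return value.  */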
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
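/* __memset_chk_erms checks that the length in RDX does not exceed the
   object size in RCX and then falls straight through into __memset_erms
   below.  */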
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to the debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
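/* rep stosb stores AL into RCX bytes starting at (RDI).  The original
   destination pointer is parked in RDX across the string operation so it
   can be moved back into RAX as the return value.  */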
L(stosb):
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), (%rax)
	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
#else
	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
#endif
	VZEROUPPER_RETURN

	/* If we have AVX512 mask instructions, put L(less_vec) close to the
	   entry: it does not take much space and is likely a hot target.  */
#ifdef USE_LESS_VEC_MASK_STORE
	.p2align 4,, 10
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
	/* Clear the high bits from edi, keeping only the bits relevant to
	   the page-cross check.  Note that rax, set in
	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN, is used as the pointer from
	   here on out.  */
	andl	$(PAGE_SIZE - 1), %edi
	/* Check if the VEC_SIZE store crosses a page.  Mask stores suffer
	   serious performance degradation when they have to fault-suppress.
	 */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
	/* This is generally considered a cold target.  */
	ja	L(cross_page)
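	/* No page cross: do the whole memset with a single masked store.
	   bzhi builds a mask with the low RDX bits set (e.g. RDX == 5 gives
	   0x1f), so exactly RDX bytes are written without any further size
	   branches.  */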
# if VEC_SIZE > 32
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
	kmovq	%rcx, %k1
# else
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k1
# endif
	vmovdqu8 %VEC(0), (%rax){%k1}
	VZEROUPPER_RETURN

# if defined USE_MULTIARCH && IS_IN (libc)
	/* L(stosb_local) is placed here when L(less_vec) sits between
	   L(stosb_more_2x_vec) and the entry point.  This keeps the
	   L(stosb_more_2x_vec) target cache-line aligned.  */
	.p2align 4,, 10
L(stosb_local):
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
#endif

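/* In the unaligned_erms entry points, sizes above the tunable
   __x86_rep_stosb_threshold are handled with rep stosb at L(stosb_local);
   everything else falls through into the vector code below.  */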
#if defined USE_MULTIARCH && IS_IN (libc)
	.p2align 4
L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb_local)
#endif
	/* Fallthrough goes to L(loop).  Tests for memset sizes in (2x, 4x]
	   and (4x, 8x] jump to their targets.  */
L(more_2x_vec):

	/* Two different methods of setting up the pointers / compare.  The
	   two methods are based on the fact that EVEX/AVX512 mov
	   instructions take more bytes than AVX2/SSE2 mov instructions, and
	   that EVEX/AVX512 machines also have fast LEA_BID.  Both set up
	   END_REG to avoid a complex address mode.  For EVEX/AVX512 this
	   saves code size and keeps a few targets in one fetch block.
	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* For EVEX/AVX512 compute END_REG as dst + size - (VEC_SIZE * 4 +
	   LOOP_4X_OFFSET) with LEA_BID.  */

	/* END_REG is rcx for EVEX/AVX512.  */
	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif

	/* Store the first 2x VEC before the cmp, as any path forward will
	   require it.  */
	VMOVU	%VEC(0), (%rax)
	VMOVU	%VEC(0), VEC_SIZE(%rax)


#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* For AVX2/SSE2 compute END_REG (rdi) with the ALU.  */
	addq	%rdx, %END_REG
#endif

	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Store the next 2x VEC regardless.  */
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* If LOOP_4X_OFFSET is nonzero, don't readjust LOOP_REG (rdi); just
	   add the extra offset to the addresses in the loop.  Used for
	   AVX512 to save space, as there is no way to encode
	   (VEC_SIZE * 4) in an imm8.  */
# if LOOP_4X_OFFSET == 0
	subq	$-(VEC_SIZE * 4), %LOOP_REG
# endif
	/* Avoid an imm32 compare here to save code size.  */
	cmpq	%rdi, %rcx
#else
	addq	$-(VEC_SIZE * 4), %END_REG
	cmpq	$(VEC_SIZE * 8), %rdx
#endif
	jbe	L(last_4x_vec)
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* Set LOOP_REG (rdx).  */
	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
	/* Align dst for the loop.  */
	andq	$(VEC_SIZE * -2), %LOOP_REG
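	/* LOOP_REG is rounded down to a 2 * VEC_SIZE boundary so the VMOVA
	   stores in the loop are aligned; any bytes skipped by the rounding
	   were already covered by the unaligned stores above.  The loop
	   advances with subq $-(VEC_SIZE * 4) rather than addq because the
	   negative constant can be encoded as a sign-extended imm8 when
	   VEC_SIZE * 4 == 128, which saves code size.  */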
	.p2align 4
L(loop):
	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
	subq	$-(VEC_SIZE * 4), %LOOP_REG
	cmpq	%END_REG, %LOOP_REG
	jb	L(loop)
	.p2align 4,, MOV_SIZE
L(last_4x_vec):
	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
L(return):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	ret
#endif

	.p2align 4,, 10
#ifndef USE_LESS_VEC_MASK_STORE
# if defined USE_MULTIARCH && IS_IN (libc)
	/* If USE_LESS_VEC_MASK_STORE is not defined, put L(stosb_local)
	   here.  It will be in range for a 2-byte jump encoding.  */
L(stosb_local):
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
	/* Define L(less_vec) only if not otherwise defined.  */
	.p2align 4
L(less_vec):
#endif
L(cross_page):
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
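	/* Copy the low 8 bytes of the splatted pattern from XMM0 into RDI so
	   the sub-16-byte cases below can use plain integer stores.  */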
	MOVQ	%XMM0, %rdi
	cmpl	$8, %edx
	jae	L(between_8_15)
	cmpl	$4, %edx
	jae	L(between_4_7)
	cmpl	$1, %edx
	ja	L(between_2_3)
	jb	L(return)
	movb	%sil, (%rax)
	VZEROUPPER_RETURN

	/* Align the small-size targets only when not aligning them would
	   make the code cross a fetch line.  */
#if VEC_SIZE > 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	VMOVU	%YMM0, (%rax)
	VMOVU	%YMM0, -32(%rax, %rdx)
	VZEROUPPER_RETURN
#endif

#if VEC_SIZE >= 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
	VMOVU	%XMM0, (%rax)
	VMOVU	%XMM0, -16(%rax, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	%rdi, (%rax)
	movq	%rdi, -8(%rax, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%edi, (%rax)
	movl	%edi, -4(%rax, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%di, (%rax)
	movb	%dil, -1(%rax, %rdx)
	VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
