/* Copyright (C) 2018-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "memset-reg.h"
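/* memset-reg.h provides the register name aliases used below:
   dstin, val/valw, count, dst and dstend.  */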

#ifndef MEMSET
# define MEMSET __memset_base64
#endif

/* To disable DC ZVA, set this threshold to 0. */
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

ENTRY_ALIGN (MEMSET, 6)

	PTR_ARG (0)
	SIZE_ARG (2)

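	/* Broadcast the low byte of val across all 8 bytes of the
	   register: each bfi copies the already-replicated low bits one
	   step higher, equivalent to (val & 0xff) * 0x0101010101010101.  */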
	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32

	add	dstend, dstin, count

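	/* dstend points one past the last byte, so stores addressed from
	   dstend can overlap stores addressed from dstin.  Dispatch on
	   size: 0..15 bytes handled inline, 16..96 bytes in set_medium,
	   larger sizes in set_long.  */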
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  */
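	/* For 8..15 bytes the two 8-byte stores below overlap in the
	   middle, so one pair covers every size in the range; the 4..7
	   byte case uses the same trick with 4-byte stores.  */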
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret

	.p2align 3
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  */
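	/* The first and last 16-byte stores alone cover 16..31 bytes;
	   bit 6 of count selects the 64..96 byte path, and bit 5 adds
	   two more stores for 32..63 bytes.  */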
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
L(set_long):
	stp	val, val, [dstin]
	bic	dst, dstin, 15
#if DC_ZVA_THRESHOLD
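	/* Take the DC ZVA path only when zeroing at least
	   DC_ZVA_THRESHOLD bytes: if count is below the threshold, ccmp
	   forces the flags to 0 (clearing Z), so b.eq is taken only when
	   both count >= DC_ZVA_THRESHOLD and val == 0.  */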
	cmp	count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	b.eq	L(zva_64)
#endif
	/* Sizes below the threshold and non-zero memsets do not use
	   DC ZVA.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias it for the loop.  Subtracting an extra 1
	 * from count lets a single tbz test whether the trailing count
	 * is less than 33 bytes, so that 2 unnecessary stps can be
	 * bypassed.
	 */
	sub	count, count, 64+16+1
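	/* In the bias above, 16 accounts for the loop storing from
	   dst + 16 onwards (the first 16 bytes were already written at
	   dstin), and 64 is the usual bias for the subs/b.hs loop exit
	   test.  */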

#if DC_ZVA_THRESHOLD
	/* Align the loop on a 16-byte boundary; this may be friendlier
	   to the instruction cache.  */
	nop
#endif

1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

#if DC_ZVA_THRESHOLD
	.p2align 3
L(zva_64):
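	/* DC ZVA zeroes one ZVA block per execution; as the label
	   suggests, this path assumes the block size is 64 bytes, so it
	   should only be used when DCZID_EL0 reports that size.  */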
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	bic	dst, dst, 63

	/*
	 * The memory writes above might cross a cache line boundary and
	 * leave a cache line only partially dirty.  Zeroing such a line
	 * with DC ZVA would incur extra cost, since the untouched part
	 * of the line has to be loaded from memory before zeroing.
	 *
	 * So, write the first 64-byte aligned block using stp to force
	 * a fully dirty cache line.
	 */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	stp	val, val, [dst, 96]
	stp	val, val, [dst, 112]

	sub	count, dstend, dst
	/*
	 * Adjust count and bias it for the loop.  Subtracting an extra 1
	 * from count lets a single tbz test whether the trailing count
	 * is less than 33 bytes, so that 2 unnecessary stps can be
	 * bypassed.
	 */
	sub	count, count, 128+64+64+1
	add	dst, dst, 128
	nop
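	/* As above, the nop keeps the DC ZVA loop entry aligned on a
	   16-byte boundary.  */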

	/* DC ZVA zeroes 64 bytes at a time.  */
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte aligned block using stp to force a
	 * fully dirty cache line.
	 */
	stp	val, val, [dst, 0]
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)