1/* Set a block of memory to some byte value.  For SUN4V M7.
2   Copyright (C) 2017-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* XCC selects the 64-bit integer condition codes for the conditional
   branches below; allow an including wrapper to override it.  */
#ifndef XCC
# define XCC    xcc
#endif
	/* Declare the global registers %g2/%g3 as scratch so the
	   assembler accepts their use in this file.  */
	.register	%g2, #scratch
	.register	%g3, #scratch
26
27/* The algorithm is as follows :
28 *
 *	For small stores of 7 or fewer bytes, single bytes are stored.
30 *
 *	For stores of fewer than 32 bytes, align the address on a 4 byte
 *	boundary.  Then store as many 4-byte chunks as possible, followed
 *	by trailing bytes.
33 *
34 *	For sizes greater than 32 bytes, align the address on 8 byte boundary.
35 *	if (count >= 64) {
36 *		store 8-bytes chunks to align the address on 64 byte boundary
37 *		if (value to be set is zero && count >= MIN_ZERO) {
38 *			Using BIS stores, set the first long word of each
39 *			64-byte cache line to zero which will also clear the
40 *			other seven long words of the cache line.
41 *		}
42 *		else if (count >= MIN_LOOP) {
43 *			Using BIS stores, set the first long word of each of
44 *			ST_CHUNK cache lines (64 bytes each) before the main
45 *			loop is entered.
46 *			In the main loop, continue pre-setting the first long
47 *			word of each cache line ST_CHUNK lines in advance while
48 *			setting the other seven long words (56 bytes) of each
49 *			cache line until fewer than ST_CHUNK*64 bytes remain.
50 *			Then set the remaining seven long words of each cache
51 *			line that has already had its first long word set.
52 *		}
53 *		store remaining data in 64-byte chunks until less than
54 *		64 bytes remain.
55 *	}
 *	Store as many 8-byte chunks as possible, followed by trailing bytes.
57 *
58 *
59 * BIS = Block Init Store
60 *   Doing the advance store of the first element of the cache line
61 *   initiates the displacement of a cache line while only using a single
62 *   instruction in the pipeline. That avoids various pipeline delays,
63 *   such as filling the miss buffer. The performance effect is
64 *   similar to prefetching for normal stores.
65 *   The special case for zero fills runs faster and uses fewer instruction
66 *   cycles than the normal memset loop.
67 *
 * We only use BIS for memsets of more than MIN_LOOP bytes because a sequence
 * of BIS stores must be followed by a membar #StoreStore. The benefit of
70 * the BIS store must be balanced against the cost of the membar operation.
71 */
72
73/*
74 * ASI_STBI_P marks the cache line as "least recently used"
75 * which means if many threads are active, it has a high chance
76 * of being pushed out of the cache between the first initializing
77 * store and the final stores.
78 * Thus, we use ASI_STBIMRU_P which marks the cache line as
79 * "most recently used" for all but the last store to the cache line.
80 */
81
/* ASI (address space identifier) encodings for the M7
   block-initializing stores; see the comments above for why the
   MRU variant is used for all but the last store to a line.  */
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define ASI_ST_BLK_INIT_MRU_P 0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
/* Fully parenthesized so the macro expands safely inside any larger
   expression (e.g. MIN_LOOP*2 or -MIN_LOOP); the current uses
   (cmp immediates) are unaffected.  */
#define MIN_LOOP	((ST_CHUNK) * 64)
#define MIN_ZERO	256

/* Inside libc there is no special fault handling to hook in, so the
   exception wrappers are identity macros.  */
#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P
96
97#if IS_IN (libc)
98
99	.text
100	.align		32
101
ENTRY(__bzero_niagara7)
	/* bzero (dst, size)  */
	/* Map bzero's (dst, size) onto memset's (dst, c, size):
	   move the size into %o2, set the fill value %o1 to zero, and
	   fall straight through into __memset_niagara7 below.  This
	   relies on the memset entry immediately following.  */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */
END(__bzero_niagara7)
108
ENTRY(__memset_niagara7)
	/* memset (src, c, size)  */
	/* Register usage throughout:
	     %o0 - original dst; never modified, returned as-is
	     %o1 - fill byte c, replicated to 2/4/8-byte width below
	     %o2 - total count, later the trailing-byte count
	     %o3 - scratch: alignment distance / remaining-byte counter
	     %o4 - byte count of whole 64-byte blocks
	     %o5 - working destination pointer
	     %g1 - cache-line chunk counter for the BIS loops
	     %g3 - offset temporary in the zero-fill loop  */
	mov	%o0, %o5		/* copy sp1 before using it  */
	cmp	%o2, 7			/* if small counts, just write bytes  */
	bleu,pn %XCC, .Lwrchar
	 and	%o1, 0xff, %o1		/* o1 is (char)c  */

	/* Replicate the fill byte across %o1.  */
	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c  */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign		/* medium counts use 4-byte stores  */
	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c  */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c  */

.Ldbalign:
	andcc	%o5, 7, %o3		/* is sp1 aligned on a 8 byte bound?  */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned  */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned)  */

	add	%o2, %o3, %o2		/* update o2 with new count  */
	/* Set -(%o3) bytes till sp1 long word aligned  */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set  */
	inccc	%o3			/* byte clearing loop   */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now sp1 is long word aligned (sp1 is found in %o5) */
.Lblkalign:
	cmp	%o2, 64		/* check if there are 64 bytes to set  */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3		/* delay slot: %o3 = remaining count  */

	andcc	%o5, 63, %o3		/* is sp1 block aligned?  */
	bz,pt	%XCC, .Lblkwr		/* now block aligned  */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned)  */
	add	%o2, %o3, %o2		/* o2 is the remainder  */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
	/* Use long word stores.  */
	/* Recall that dst is already long word aligned  */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now sp1 is block aligned  */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes  */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0  */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores  */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set  */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar   */
	 nop				/* must be > pre-cleared lines  */

	/* initial cache-clearing stores  */
	/* get store pipeline moving  */

/*	Primary memset loop for large memsets  */
.Lwr_loop:
	mov	ST_CHUNK, %g1
.Lwr_loop_start:
	/* Pre-set the first long word of the next ST_CHUNK cache
	   lines, four lines per iteration, using the MRU BIS variant
	   so the lines stay cached until filled in below.  */
	subcc	%g1, 4, %g1
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5  */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store  */

.Lwr_loop_rest:
	/* Fill words 1-6 of each pre-set line (offsets 8..48 from
	   the line base; note %o5 is biased by -8), then finish the
	   line with a BIS store of word 7 in the delay slot, which
	   marks the line least-recently-used (see comments above).  */
	stx	%o1,[%o5+8+8]
	sub	%o4, 64, %o4
	stx	%o1,[%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1,[%o5+24+8]
	stx	%o1,[%o5+32+8]
	stx	%o1,[%o5+40+8]
	add	%o5, 64, %o5
	stx	%o1,[%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))

	 add	%o5, 8, %o5		/* restore %o5 offset  */

	/* If more than ST_CHUNK*64 bytes remain to set, continue  */
	/* setting the first long word of each cache line in advance  */
	/* to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done	/* no whole blocks left  */
	 nop

	sub	%o5, 8, %o5		/* adjust %o5 for ASI store  */
.Lwr_loop_small:
	/* Fewer than ST_CHUNK lines remain: pre-set and fill one
	   cache line per iteration.  */
	add	%o5, 8, %o5		/* adjust %o5 for ASI store  */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1,[%o5+8]
	stx	%o1,[%o5+16]
	stx	%o1,[%o5+24]
	stx	%o1,[%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1,[%o5+40]
	add	%o5, 56, %o5
	stx	%o1,[%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))	/* word 7, LRU, in delay slot  */

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* restore %o5 offset  */

/*	Special case loop for zero fill memsets  */
/*	For each 64 byte cache line, single STBI to first element  */
/*	clears line  */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set  */
					/* to pay %asi + membar cost  */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4		/* bias the count so the loop exit
					   test can use the sign of %o4  */

.Lwrzero_loop:
	/* Clear four cache lines (256 bytes) per iteration with one
	   BIS store each; %g3 supplies the +64/-128/-64 offsets
	   around the mid-loop update of %o5.  */
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add %g3, 64, %g3
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o4, 256, %o4		/* undo the bias  */

	brz,pn	%o4, .Lbsi_done
	 nop
.Lwrzero_small:
	/* Clear the remaining 1-3 whole cache lines.  */
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BSI  */

.Lshort_set:
	/* %o4 = whole-block bytes still to set (may be 0 when
	   arriving from the BIS loops), %o3 = bytes after blocks.  */
	cmp	%o4, 64			/* check if 64 bytes to set  */
	blu	%XCC, 5f
	 nop
4:					/* set final blocks of 64 bytes  */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]		/* offset 48 of the block  */
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]		/* offset 56, in the delay slot  */

5:
	/* Set the remaining long words  */
.Lwrshort:
	subcc	%o3, 8, %o3		/* Can we store any long words?  */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words  */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words  */
	bgeu,pt %XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars  */
	brnz	%o2, .Lwrfin
	 nop
	retl				/* done: no trailing bytes  */
	 nop

.Lwdalign:
	/* Medium (< 32 byte) path: byte-store until word aligned,
	   then use 4-byte stores.  */
	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary  */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3  */

	dec	%o2			/* decrement count  */
	stb	%o1, [%o5]		/* clear a byte  */
	b	.Lwdalign		/* re-test alignment  */
	 inc	%o5			/* next byte  */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop  */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any  */

.Lwrchar:
	/* Set the remaining bytes, if any  */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved  */
	 nop
END(__memset_niagara7)
334#endif
335