1/* Optimized version of the standard bzero() function.
2   This file is part of the GNU C Library.
3   Copyright (C) 2000-2021 Free Software Foundation, Inc.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/* Return: dest
20
21   Inputs:
22        in0:    dest
23        in1:    count
24
   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
29   Since a stf.spill f0 can store 16B in one go, we use this instruction
30   to get peak speed.  */
31
32#include <sysdep.h>
33#undef ret
34
35#define dest		in0
36#define	cnt		in1
37
38#define tmp		r31
39#define save_lc		r30
40#define ptr0		r29
41#define ptr1		r28
42#define ptr2		r27
43#define ptr3		r26
44#define ptr9 		r24
45#define	loopcnt		r23
46#define linecnt		r22
47#define bytecnt		r21
48
49// This routine uses only scratch predicate registers (p6 - p15)
50#define p_scr		p6	// default register for same-cycle branches
51#define p_unalgn	p9
52#define p_y		p11
53#define p_n		p12
54#define p_yy		p13
55#define p_nn		p14
56
57#define movi0		mov
58
59#define MIN1		15
60#define MIN1P1HALF	8
61#define LINE_SIZE	128
62#define LSIZE_SH        7			// shift amount
63#define PREF_AHEAD	8
64
65#define USE_FLP
66#if defined(USE_INT)
67#define store		st8
68#define myval		r0
69#elif defined(USE_FLP)
70#define store		stf8
71#define myval		f0
72#endif
73
74.align	64
75ENTRY(bzero)
76{ .mmi
77	.prologue
78	alloc	tmp = ar.pfs, 2, 0, 0, 0
79	lfetch.nt1 [dest]
80	.save   ar.lc, save_lc
81	movi0	save_lc = ar.lc
82} { .mmi
83	.body
84	mov	ret0 = dest		// return value
85	nop.m	0
86	cmp.eq	p_scr, p0 = cnt, r0
87;; }
88{ .mmi
89	and	ptr2 = -(MIN1+1), dest	// aligned address
90	and	tmp = MIN1, dest	// prepare to check for alignment
91	tbit.nz p_y, p_n = dest, 0	// Do we have an odd address? (M_B_U)
92} { .mib
93	mov	ptr1 = dest
94	nop.i	0
95(p_scr)	br.ret.dpnt.many rp		// return immediately if count = 0
96;; }
97{ .mib
98	cmp.ne	p_unalgn, p0 = tmp, r0
99} { .mib					// NB: # of bytes to move is 1
100	sub	bytecnt = (MIN1+1), tmp		//     higher than loopcnt
101	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
102(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
103;; }
104{ .mmi
105(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
106(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
107(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
108;; }
109{ .mib
110(p_y)	add	cnt = -8, cnt
111(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
112} { .mib
113(p_y)	st8	[ptr2] = r0,-4
114(p_n)	add	ptr2 = 4, ptr2
115;; }
116{ .mib
117(p_yy)	add	cnt = -4, cnt
118(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
119} { .mib
120(p_yy)	st4	[ptr2] = r0,-2
121(p_nn)	add	ptr2 = 2, ptr2
122;; }
123{ .mmi
124	mov	tmp = LINE_SIZE+1		// for compare
125(p_y)	add	cnt = -2, cnt
126(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
127} { .mmi
128	nop.m	0
129(p_y)	st2	[ptr2] = r0,-1
130(p_n)	add	ptr2 = 1, ptr2
131;; }
132
133{ .mmi
134(p_yy)	st1	[ptr2] = r0
135	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
136} { .mbb
137(p_yy)	add	cnt = -1, cnt
138(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
139;; }
140{ .mib
141	nop.m 	0
142	shr.u	linecnt = cnt, LSIZE_SH
143	nop.b	0
144;; }
145
146	.align 32
147.l1b:	// ------------------//  L1B: store ahead into cache lines; fill later
148{ .mmi
149	and	tmp = -(LINE_SIZE), cnt		// compute end of range
150	mov	ptr9 = ptr1			// used for prefetching
151	and	cnt = (LINE_SIZE-1), cnt	// remainder
152} { .mmi
153	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
154	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
155;; }
156{ .mmi
157(p_scr)	add	loopcnt = -1, linecnt
158	add	ptr2 = 16, ptr1	// start of stores (beyond prefetch stores)
159	add	ptr1 = tmp, ptr1	// first address beyond total range
160;; }
161{ .mmi
162	add	tmp = -1, linecnt	// next loop count
163	movi0	ar.lc = loopcnt
164;; }
165.pref_l1b:
166{ .mib
167	stf.spill [ptr9] = f0, 128	// Do stores one cache line apart
168	nop.i   0
169	br.cloop.dptk.few .pref_l1b
170;; }
171{ .mmi
172	add	ptr0 = 16, ptr2		// Two stores in parallel
173	movi0	ar.lc = tmp
174;; }
175.l1bx:
176 { .mmi
177	stf.spill [ptr2] = f0, 32
178	stf.spill [ptr0] = f0, 32
179 ;; }
180 { .mmi
181	stf.spill [ptr2] = f0, 32
182	stf.spill [ptr0] = f0, 32
183 ;; }
184 { .mmi
185	stf.spill [ptr2] = f0, 32
186	stf.spill [ptr0] = f0, 64
187	cmp.lt	p_scr, p0 = ptr9, ptr1	// do we need more prefetching?
188 ;; }
189{ .mmb
190	stf.spill [ptr2] = f0, 32
191(p_scr)	stf.spill [ptr9] = f0, 128
192	br.cloop.dptk.few .l1bx
193;; }
194{ .mib
195	cmp.gt  p_scr, p0 = 8, cnt	// just a few bytes left ?
196(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment
197;; }
198
199.fraction_of_line:
200{ .mib
201	add	ptr2 = 16, ptr1
202	shr.u	loopcnt = cnt, 5   	// loopcnt = cnt / 32
203;; }
204{ .mib
205	cmp.eq	p_scr, p0 = loopcnt, r0
206	add	loopcnt = -1, loopcnt
207(p_scr)	br.cond.dpnt.many .store_words
208;; }
209{ .mib
210	and	cnt = 0x1f, cnt		// compute the remaining cnt
211	movi0   ar.lc = loopcnt
212;; }
213	.align 32
214.l2:	// -----------------------------//  L2A:  store 32B in 2 cycles
215{ .mmb
216	store	[ptr1] = myval, 8
217	store	[ptr2] = myval, 8
218;; } { .mmb
219	store	[ptr1] = myval, 24
220	store	[ptr2] = myval, 24
221	br.cloop.dptk.many .l2
222;; }
223.store_words:
224{ .mib
225	cmp.gt	p_scr, p0 = 8, cnt	// just a few bytes left ?
226(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
227;; }
228
229{ .mmi
230	store	[ptr1] = myval, 8	// store
231	cmp.le	p_y, p_n = 16, cnt	//
232	add	cnt = -8, cnt		// subtract
233;; }
234{ .mmi
235(p_y)	store	[ptr1] = myval, 8	// store
236(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
237(p_y)	add	cnt = -8, cnt		// subtract
238;; }
239{ .mmi					// store
240(p_yy)	store	[ptr1] = myval, 8
241(p_yy)	add	cnt = -8, cnt		// subtract
242;; }
243
244.move_bytes_from_alignment:
245{ .mib
246	cmp.eq	p_scr, p0 = cnt, r0
247	tbit.nz.unc p_y, p0 = cnt, 2	// should we terminate with a st4 ?
248(p_scr)	br.cond.dpnt.few .restore_and_exit
249;; }
250{ .mib
251(p_y)	st4	[ptr1] = r0,4
252	tbit.nz.unc p_yy, p0 = cnt, 1	// should we terminate with a st2 ?
253;; }
254{ .mib
255(p_yy)	st2	[ptr1] = r0,2
256	tbit.nz.unc p_y, p0 = cnt, 0	// should we terminate with a st1 ?
257;; }
258
259{ .mib
260(p_y)	st1	[ptr1] = r0
261;; }
262.restore_and_exit:
263{ .mib
264	nop.m	0
265	movi0	ar.lc = save_lc
266	br.ret.sptk.many rp
267;; }
268
269.move_bytes_unaligned:
270{ .mmi
271       .pred.rel "mutex",p_y, p_n
272       .pred.rel "mutex",p_yy, p_nn
273(p_n)	cmp.le  p_yy, p_nn = 4, cnt
274(p_y)	cmp.le  p_yy, p_nn = 5, cnt
275(p_n)	add	ptr2 = 2, ptr1
276} { .mmi
277(p_y)	add	ptr2 = 3, ptr1
278(p_y)	st1	[ptr1] = r0, 1		// fill 1 (odd-aligned) byte
279(p_y)	add	cnt = -1, cnt		// [15, 14 (or less) left]
280;; }
281{ .mmi
282(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
283	add	ptr3 = ptr1, cnt	// prepare last store
284	movi0	ar.lc = save_lc
285} { .mmi
286(p_yy)	st2	[ptr1] = r0, 4		// fill 2 (aligned) bytes
287(p_yy)	st2	[ptr2] = r0, 4		// fill 2 (aligned) bytes
288(p_yy)	add	cnt = -4, cnt		// [11, 10 (o less) left]
289;; }
290{ .mmi
291(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
292	add	ptr3 = -1, ptr3		// last store
293	tbit.nz p_scr, p0 = cnt, 1	// will there be a st2 at the end ?
294} { .mmi
295(p_y)	st2	[ptr1] = r0, 4		// fill 2 (aligned) bytes
296(p_y)	st2	[ptr2] = r0, 4		// fill 2 (aligned) bytes
297(p_y)	add	cnt = -4, cnt		// [7, 6 (or less) left]
298;; }
299{ .mmi
300(p_yy)	st2	[ptr1] = r0, 4		// fill 2 (aligned) bytes
301(p_yy)	st2	[ptr2] = r0, 4		// fill 2 (aligned) bytes
302					// [3, 2 (or less) left]
303	tbit.nz p_y, p0 = cnt, 0	// will there be a st1 at the end ?
304} { .mmi
305(p_yy)	add	cnt = -4, cnt
306;; }
307{ .mmb
308(p_scr)	st2	[ptr1] = r0		// fill 2 (aligned) bytes
309(p_y)	st1	[ptr3] = r0		// fill last byte (using ptr3)
310	br.ret.sptk.many rp
311;; }
312END(bzero)
313