/* Optimized 32-bit memset implementation for POWER6.
   Copyright (C) 1997-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (1024 bits). There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction.  */
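/* A rough C-level sketch of the dispatch strategy implemented below.
   This is an illustration only, not the exact code this file generates:
   the helper names are hypothetical, and dcbz () stands for the
   "data cache block zero" instruction, which clears one 128-byte line.

     void *memset_sketch (void *s, int c, size_t n)
     {
       unsigned char *p = s;
       if (n <= 4)
         { store_bytes (p, c, n); return s; }         // L(small)
       align_to_word_then_32_bytes (&p, &n, c);       // prologue below
       if (c == 0)
         while (n >= 128)                             // L(zloopstart)
           { dcbz (p); p += 128; n -= 128; }          // clear a cache line
       else
         store_32_byte_chunks (&p, &n, c);            // L(nzloopstart)
       store_tail_0_to_31 (p, c, n);                  // L(medium)
       return s;
     }  */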

	.machine power6
EALIGN (memset, 7, 0)
	CALL_MCOUNT

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning). */
#define rMEMP2	r8

#define rNEG64	r8	/* Constant -64 for clearing with dcbz.  */
#define rMEMP3	r9	/* Alt mem pointer.  */
L(_memset):
/* Take care of case for size <= 4.  */
	cmplwi	cr1, rLEN, 4
	andi.	rALIGN, rMEMP0, 3
	mr	rMEMP, rMEMP0
	ble-	cr1, L(small)
/* Align to word boundary.  */
	cmplwi	cr5, rLEN, 31
	insrwi	rCHR, rCHR, 8, 16	/* Replicate byte to halfword.  */
	beq+	L(aligned)
	mtcrf	0x01, rMEMP0
	subfic	rALIGN, rALIGN, 4
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)

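/* A C-level sketch (illustration only) of the word-alignment prologue
   above.  The CR7 bit tests come from the mtcrf of the original address;
   "w" is the value with c replicated into both bytes of the low halfword
   (it is widened to a full word at L(aligned) below):

     unsigned int w = (unsigned char) c;
     w |= w << 8;                               // insrwi rCHR,rCHR,8,16

     unsigned char *p = (unsigned char *) s;
     if (((uintptr_t) s & 3) != 0)              // not already word aligned
       {
         size_t pad = 4 - ((uintptr_t) s & 3);  // subfic rALIGN,rALIGN,4
         p += pad;  n -= pad;
         if ((uintptr_t) s & 1)                               // CR7 bit 31
           ((unsigned char *) s)[0] = (unsigned char) c;      // stb
         if (pad != 1)                          // unless only 1 byte needed
           *(unsigned short *) (p - 2) = (unsigned short) w;  // sth
       }  */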
	.align 4
/* Handle the case of size <= 31.  */
L(aligned):
	mtcrf	0x01, rLEN
	insrwi	rCHR, rCHR, 16, 0	/* Replicate halfword to word.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  */
	andi.	rALIGN, rMEMP, 0x1C
	subfic	rALIGN, rALIGN, 0x20
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)
	stw	rCHR, -4(rMEMP2)
	stwu	rCHR, -8(rMEMP2)
	nop
L(a1):	blt	cr1, L(a2)
	stw	rCHR, -4(rMEMP2)
	stw	rCHR, -8(rMEMP2)
	stw	rCHR, -12(rMEMP2)
	stwu	rCHR, -16(rMEMP2)
L(a2):	bf	29, L(caligned)
	stw	rCHR, -4(rMEMP2)

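/* Sketch (in C, illustration only) of the 32-byte alignment step above.
   rALIGN bytes are stored backwards from the new, 32-byte aligned pointer,
   selecting 8-, 16- and 4-byte pieces from the bits of rALIGN; "w" is the
   word-replicated value of c:

     if (((uintptr_t) p & 0x1c) != 0)                // not yet 32-byte aligned
       {
         size_t pad = 32 - ((uintptr_t) p & 0x1c);   // p is word aligned
         unsigned int *q = (unsigned int *) (p + pad);
         p += pad;  n -= pad;
         if (pad & 8)                                 // CR7 bit 28
           { q[-1] = w;  q[-2] = w;  q -= 2; }
         if (pad >= 16)                               // cr1: pad cmp 0x10
           { q[-1] = w;  q[-2] = w;  q[-3] = w;  q[-4] = w;  q -= 4; }
         if (pad & 4)                                 // CR7 bit 29
           q[-1] = w;
       }  */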
	.align 3
/* Now aligned to a 32-byte boundary.  */
L(caligned):
	cmplwi	cr1, rCHR, 0
	clrrwi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
L(nondcbz):
	beq	L(medium)	/* We may not actually get to do a full line.  */
	nop
/* Storing a non-zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at a cache line (128-byte) boundary.  */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary,
   use the cacheAligned1 code to finish the tail.  */
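/* The unrolled blocks below implement, roughly, this loop (C sketch for
   illustration only; "w" is the word-replicated value of c):

     // p is 32-byte aligned here; at most three 32-byte chunks are
     // needed to reach the next 128-byte (cache line) boundary.
     while (((uintptr_t) p & 127) != 0)
       {
         if (n < 128)
           goto cacheAligned1;                 // finish with smaller stores
         unsigned int *q = (unsigned int *) p;
         q[0] = q[1] = q[2] = q[3] = q[4] = q[5] = q[6] = q[7] = w;
         p += 32;  n -= 32;
       }
     // fall through to L(nzCacheAligned): p is now cache-line aligned  */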
	cmplwi	cr1,rLEN,128

	andi.	rTMP,rMEMP,127
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	stw	rCHR,0(rMEMP)
	stw	rCHR,4(rMEMP)
	stw	rCHR,8(rMEMP)
	stw	rCHR,12(rMEMP)
	stw	rCHR,16(rMEMP)
	stw	rCHR,20(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	stw	rCHR,-8(rMEMP3)
	stw	rCHR,-4(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	stw	rCHR,0(rMEMP3)
	stw	rCHR,4(rMEMP3)
	addi	rMEMP,rMEMP,32
	stw	rCHR,8(rMEMP3)
	stw	rCHR,12(rMEMP3)
	andi.	rTMP,rMEMP,127
	stw	rCHR,16(rMEMP3)
	stw	rCHR,20(rMEMP3)
	stw	rCHR,24(rMEMP3)
	stw	rCHR,28(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,32(rMEMP3)
	stw	rCHR,36(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmplwi	cr1,rLEN,128
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,40(rMEMP3)
	stw	rCHR,44(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,48(rMEMP3)
	stw	rCHR,52(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,56(rMEMP3)
	stw	rCHR,60(rMEMP3)
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned)

/* Now we are aligned to the cache line and can use dcbtst.  */
	.align 5
L(nzCacheAligned):
	cmplwi	cr1,rLEN,128
	cmplwi	cr6,rLEN,256
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(nzCacheAligned128)
	.align 4
L(nzCacheAligned128):
	nop
	addi	rMEMP3,rMEMP,64
	stw	rCHR,0(rMEMP)
	stw	rCHR,4(rMEMP)
	stw	rCHR,8(rMEMP)
	stw	rCHR,12(rMEMP)
	stw	rCHR,16(rMEMP)
	stw	rCHR,20(rMEMP)
	stw	rCHR,24(rMEMP)
	stw	rCHR,28(rMEMP)
	stw	rCHR,32(rMEMP)
	stw	rCHR,36(rMEMP)
	stw	rCHR,40(rMEMP)
	stw	rCHR,44(rMEMP)
	stw	rCHR,48(rMEMP)
	stw	rCHR,52(rMEMP)
	stw	rCHR,56(rMEMP)
	stw	rCHR,60(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only one
   store per cycle. */
	stw	rCHR,0(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,4(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,8(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,12(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,16(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,20(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,24(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,28(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,32(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,36(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,40(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,44(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,48(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,52(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,56(rMEMP3)
	ori	r1,r1,0
	stw	rCHR,60(rMEMP3)
	blt	cr6,L(cacheAligned1)
#if IS_IN (libc)
	lfd	0,-128(rMEMP)
#endif
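/* The lfd above (libc only) reloads 8 bytes of the pattern that were just
   stored 128 bytes back into FPR0, so the loop at L(nzCacheAligned256)
   can use 8-byte stfd stores instead of word stores when built in libc.  */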
	b	L(nzCacheAligned256)
	.align 5
L(nzCacheAligned256):
	cmplwi	cr1,rLEN,256
	addi	rMEMP3,rMEMP,64
#if !IS_IN (libc)
/* When we are not in libc we should use only GPRs to avoid the FPU lock
   interrupt.  */
	stw	rCHR,0(rMEMP)
	stw	rCHR,4(rMEMP)
	stw	rCHR,8(rMEMP)
	stw	rCHR,12(rMEMP)
	stw	rCHR,16(rMEMP)
	stw	rCHR,20(rMEMP)
	stw	rCHR,24(rMEMP)
	stw	rCHR,28(rMEMP)
	stw	rCHR,32(rMEMP)
	stw	rCHR,36(rMEMP)
	stw	rCHR,40(rMEMP)
	stw	rCHR,44(rMEMP)
	stw	rCHR,48(rMEMP)
	stw	rCHR,52(rMEMP)
	stw	rCHR,56(rMEMP)
	stw	rCHR,60(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
	stw	rCHR,0(rMEMP3)
	stw	rCHR,4(rMEMP3)
	stw	rCHR,8(rMEMP3)
	stw	rCHR,12(rMEMP3)
	stw	rCHR,16(rMEMP3)
	stw	rCHR,20(rMEMP3)
	stw	rCHR,24(rMEMP3)
	stw	rCHR,28(rMEMP3)
	stw	rCHR,32(rMEMP3)
	stw	rCHR,36(rMEMP3)
	stw	rCHR,40(rMEMP3)
	stw	rCHR,44(rMEMP3)
	stw	rCHR,48(rMEMP3)
	stw	rCHR,52(rMEMP3)
	stw	rCHR,56(rMEMP3)
	stw	rCHR,60(rMEMP3)
#else
/* We are in libc and this is a long memset so we can use FPRs and can afford
   occasional FPU locked interrupts.  */
	stfd	0,0(rMEMP)
	stfd	0,8(rMEMP)
	stfd	0,16(rMEMP)
	stfd	0,24(rMEMP)
	stfd	0,32(rMEMP)
	stfd	0,40(rMEMP)
	stfd	0,48(rMEMP)
	stfd	0,56(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
	stfd	0,0(rMEMP3)
	stfd	0,8(rMEMP3)
	stfd	0,16(rMEMP3)
	stfd	0,24(rMEMP3)
	stfd	0,32(rMEMP3)
	stfd	0,40(rMEMP3)
	stfd	0,48(rMEMP3)
	stfd	0,56(rMEMP3)
#endif
	bge	cr1,L(nzCacheAligned256)
	dcbtst	0,rMEMP
	b	L(cacheAligned1)

	.align 4
/* Storing a zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at a cache line (128-byte) boundary.  If the
   remaining length spans a full cache line we can use the Data Cache
   Block Zero (dcbz) instruction.  */
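/* Rough C-level equivalent of the dcbz path below (illustration only;
   dcbz () stands for the "dcbz 0,rMEMP" instruction, which zeroes one
   full 128-byte cache line):

     // first reach a 128-byte boundary with 32-byte stores of zero,
     // much as in the non-zero path above, then:
     while (n >= 128)
       {
         dcbz (p);                // zero the whole line, no stores needed
         p += 128;  n -= 128;
       }
     // the remaining n < 128 bytes go through L(cacheAligned1)/L(medium)

   The unrolled variants below (L(cacheAligned128), L(cacheAligned256),
   L(cacheAligned512)) mainly differ in how many dcbz's are issued per
   loop iteration.  */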
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If rLEN is less than the distance to the next cache-line boundary,
   use the cacheAligned1 code to finish the tail.  */
	cmplwi	cr1,rLEN,128
	beq	L(medium)
L(getCacheAligned):
	andi.	rTMP,rMEMP,127
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	stw	rCHR,0(rMEMP)
	stw	rCHR,4(rMEMP)
	stw	rCHR,8(rMEMP)
	stw	rCHR,12(rMEMP)
	stw	rCHR,16(rMEMP)
	stw	rCHR,20(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	stw	rCHR,-8(rMEMP3)
	stw	rCHR,-4(rMEMP3)
L(getCacheAligned2):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	addi	rMEMP,rMEMP,32
	stw	rCHR,0(rMEMP3)
	stw	rCHR,4(rMEMP3)
	stw	rCHR,8(rMEMP3)
	stw	rCHR,12(rMEMP3)
	andi.	rTMP,rMEMP,127
	nop
	stw	rCHR,16(rMEMP3)
	stw	rCHR,20(rMEMP3)
	stw	rCHR,24(rMEMP3)
	stw	rCHR,28(rMEMP3)
L(getCacheAligned3):
	beq	L(cacheAligned)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
	addi	rLEN,rLEN,-32
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,32(rMEMP3)
	stw	rCHR,36(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmplwi	cr1,rLEN,128
	ori	r1,r1,0
	stw	rCHR,40(rMEMP3)
	stw	rCHR,44(rMEMP3)
	cmplwi	cr6,rLEN,256
	li	rMEMP2,128
	ori	r1,r1,0
	stw	rCHR,48(rMEMP3)
	stw	rCHR,52(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,56(rMEMP3)
	stw	rCHR,60(rMEMP3)
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz.  */
	.align 4
L(cacheAligned):
	cmplwi	cr1,rLEN,128
	cmplwi	cr6,rLEN,256
	blt	cr1,L(cacheAligned1)
	li	rMEMP2,128
L(cacheAlignedx):
	cmplwi	cr5,rLEN,640
	blt	cr6,L(cacheAligned128)
	bgt	cr5,L(cacheAligned512)
	cmplwi	cr6,rLEN,512
	dcbz	0,rMEMP
	cmplwi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAligned256)
	.align 5
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
   the number of mispredicted branches to exactly one, at loop exit.  */
L(cacheAligned512):
	cmplwi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	dcbz	0,rMEMP
	addi	rLEN,rLEN,-128
	addi	rMEMP,rMEMP,128
	b	L(cacheAligned512)
	.align 5
L(cacheAligned256):
	cmplwi	cr6,rLEN,512
	dcbz	0,rMEMP
	cmplwi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256
	bge	cr6,L(cacheAligned256)
	blt	cr1,L(cacheAligned1)
	.align 4
L(cacheAligned128):
	dcbz	0,rMEMP
	addi	rMEMP,rMEMP,128
	addi	rLEN,rLEN,-128
	.align 4
L(cacheAligned1):
	cmplwi	cr1,rLEN,32
	blt	cr1,L(handletail32)
	addi	rMEMP3,rMEMP,32
	addi	rLEN,rLEN,-32
	stw	rCHR,0(rMEMP)
	stw	rCHR,4(rMEMP)
	stw	rCHR,8(rMEMP)
	stw	rCHR,12(rMEMP)
	stw	rCHR,16(rMEMP)
	stw	rCHR,20(rMEMP)
	addi	rMEMP,rMEMP,32
	cmplwi	cr1,rLEN,32
	stw	rCHR,-8(rMEMP3)
	stw	rCHR,-4(rMEMP3)
L(cacheAligned2):
	blt	cr1,L(handletail32)
	addi	rLEN,rLEN,-32
	stw	rCHR,0(rMEMP3)
	stw	rCHR,4(rMEMP3)
	stw	rCHR,8(rMEMP3)
	stw	rCHR,12(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmplwi	cr1,rLEN,32
	stw	rCHR,16(rMEMP3)
	stw	rCHR,20(rMEMP3)
	stw	rCHR,24(rMEMP3)
	stw	rCHR,28(rMEMP3)
	nop
L(cacheAligned3):
	blt	cr1,L(handletail32)
/* At this point we can overrun the store queue (pipe reject) so it is
   time to slow things down. The store queue can merge two adjacent
   stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
   So we add "group ending nops" to guarantee that we dispatch only two
   stores every other cycle. */
	ori	r1,r1,0
	ori	r1,r1,0
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,32(rMEMP3)
	stw	rCHR,36(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,40(rMEMP3)
	stw	rCHR,44(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,48(rMEMP3)
	stw	rCHR,52(rMEMP3)
	ori	r1,r1,0
	ori	r1,r1,0
	stw	rCHR,56(rMEMP3)
	stw	rCHR,60(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there.  */
	.align 3
L(handletail32):
	cmplwi	cr1,rLEN,0
	beqlr	cr1
	b	L(medium)

	.align 4
L(small):
/* Memset of 4 bytes or less.  */
	cmplwi	cr5, rLEN, 1
	cmplwi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	stb	rCHR, 3(rMEMP)
	blr
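/* C sketch of L(small) above (illustration only).  cr5 holds rLEN cmp 1
   and cr1 holds rLEN cmp 3, so each store is guarded by a conditional
   return:

     if (n < 1) return s;
     p[0] = c;
     if (n == 1) return s;
     p[1] = c;
     if (n < 3) return s;
     p[2] = c;
     if (n == 3) return s;
     p[3] = c;              // n == 4
     return s;  */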

/* Memset of 0-31 bytes.  */
	.align 5
L(medium):
	cmplwi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt	29, L(medium_29t)
L(medium_29f):
	bge	cr1, L(medium_27t)
	bflr	28
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt	cr1, L(medium_27f)
L(medium_27t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stwu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr	28
L(medium_28t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr
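/* C sketch of the L(medium) tail above (illustration only).  CR7 was
   loaded from the low bits of rLEN by an earlier mtcrf, cr1 holds rLEN
   cmp 16, and all stores run backwards from the end of the region.
   "h" and "w" denote the halfword- and word-replicated value of c:

     unsigned char *e = p + n;            // add rMEMP,rMEMP,rLEN
     if (n & 1)  *--e = (unsigned char) c;                  // stbu
     if (n & 2)  { e -= 2; *(unsigned short *) e = h; }     // sthu
     if (n & 4)  { e -= 4; *(unsigned int *) e = w; }       // stwu
     if (n >= 16)
       { e -= 16; ((unsigned int *) e)[0] = ((unsigned int *) e)[1] =
                  ((unsigned int *) e)[2] = ((unsigned int *) e)[3] = w; }
     if (n & 8)
       { e -= 8;  ((unsigned int *) e)[0] = ((unsigned int *) e)[1] = w; }
     // 1 + 2 + 4 + 16 + 8 covers every length 0..31  */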
END (memset)
libc_hidden_builtin_def (memset)