1/* Optimized version of the memccpy() function.
2   This file is part of the GNU C Library.
3   Copyright (C) 2000-2021 Free Software Foundation, Inc.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/* Return: a pointer to the byte following the copied char in dest, or NULL if char was not found
20
21   Inputs:
22        in0:    dest
23        in1:    src
24        in2:    char
25        in3:    byte count
26
27   This implementation assumes little endian mode (UM.be = 0).
28
29   This implementation assumes that it is safe to do read ahead
30   in the src block, without getting beyond its limit.  */
31
32#include <sysdep.h>
33#undef ret
34
/* Byte counts of at most OP_T_THRES are handled entirely by the
   byte-at-a-time loop (see the compare against OP_T_THRES at entry).  */
35#define OP_T_THRES 	16
/* Word size in bytes.  In the code visible here this name appears only
   in comments; the instructions use the literals 8 / -8 / 7 directly.  */
36#define OPSIZ 		8
37
/* Static (non-rotating) register assignments used by memccpy:
     saved_pr  - copy of the predicate registers taken at entry
     saved_lc  - copy of ar.lc taken at entry
     dest      - running destination pointer (starts as in0)
     src       - running source pointer (starts as in1)
     len       - bytes still to be copied (starts as in3)
     asrc      - src rounded down to a multiple of 8
     tmp       - scratch
     char      - low 8 bits of in2, the byte being searched for
     charx8    - char replicated into all 8 bytes of a word
     saved_ec  - copy of ar.ec taken at entry
     sh2       - 64 - sh1
     sh1       - 8 * (src % 8), the bit offset of src within its word
     loopcnt   - value computed for ar.lc before each counted loop
     value     - byte or word currently being copied  */
38#define saved_pr	r17
39#define saved_lc	r18
40#define dest		r19
41#define src		r20
42#define len		r21
43#define asrc		r22
44#define tmp		r23
45#define char		r24
46#define charx8		r25
47#define saved_ec	r26
48#define sh2		r28
49#define	sh1		r29
50#define loopcnt		r30
51#define	value		r31
52
/* ALIGN(n) is used once, in front of the .l2 loop.  When
   GAS_ALIGN_BREAKS_UNWIND_INFO is defined it expands to a single
   explicit nop bundle and ignores n; otherwise it expands to the
   assembler's .align directive.  */
53#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
54/* Manually force proper loop-alignment.  Note: be sure to
55   double-check the code-layout after making any changes to
56   this routine! */
57# define ALIGN(n)	{ nop 0 }
58#else
59# define ALIGN(n)	.align n
60#endif
61
/* memccpy: in0 = dest, in1 = src, in2 = char, in3 = byte count.

   Copies bytes from src to dest and stops once a byte equal to the low
   8 bits of in2 has been stored.  ret0 is then the address just past
   that byte in dest; if no such byte is stored, ret0 stays 0 (NULL).

   Phases:
     - byte count <= OP_T_THRES: only the byte loop at .cpyfew runs.
     - .l1 copies single bytes until dest is 8-byte aligned.
     - .l2 (src not 8-byte aligned) or .l3 (src 8-byte aligned) copies
       one 8-byte word per iteration with a software-pipelined loop
       built on rotating registers, rotating predicates and
       speculative loads (ld8.s checked by chk.s).
     - .cpyfew copies the remaining len % 8 bytes.
     - .gotit copies the final partial word once a word containing
       char has been detected.
     - .recovery1 / .recovery2 are the chk.s targets for .l2 / .l3.  */
62ENTRY(memccpy)
63	.prologue
	// Register frame: 4 inputs, 36 locals, no outputs, 40 rotating.
64	alloc 	r2 = ar.pfs, 4, 40 - 4, 0, 40
65
	// MEMLAT is not defined in this file; presumably it comes from
	// softpipe.h.  The recovery code below requires MEMLAT == 6.
66#include "softpipe.h"
	// Names for the rotating general registers and rotating
	// predicates used by the pipelined loops .l2 and .l3.
67	.rotr	r[MEMLAT + 7], tmp1[4], tmp2[4], val[4], tmp3[2], pos0[2]
68	.rotp	p[MEMLAT + 6 + 1]
69
70	mov	ret0 = r0		// return NULL if no match
71	.save pr, saved_pr
72	mov	saved_pr = pr		// save the predicate registers
73	mov 	dest = in0		// dest
74	.save ar.lc, saved_lc
75        mov 	saved_lc = ar.lc	// save the loop counter
76        mov 	saved_ec = ar.ec	// save the epilog counter
77	.body
78	mov 	src = in1		// src
79	extr.u	char = in2, 0, 8	// char
80	mov	len = in3		// len
81	sub	tmp = r0, in0		// tmp = -dest
82	cmp.ne	p7, p0 = r0, r0		// clear p7
83	;;
84	and	loopcnt = 7, tmp	// loopcnt = -dest % 8
85	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
86	mov	ar.ec = 0		// ec not guaranteed zero on entry
87(p6)	br.cond.spnt	.cpyfew		// copy byte by byte
88	;;
89	cmp.eq	p6, p0 = loopcnt, r0	// is dest already 8-byte aligned?
90	mux1	charx8 = char, @brcst	// charx8 = char in every byte
91(p6)	br.cond.sptk .dest_aligned
92	sub	len = len, loopcnt	// len -= -dest % 8
93	adds	loopcnt = -1, loopcnt	// --loopcnt
94	;;
95	mov	ar.lc = loopcnt
96.l1:					// copy -dest % 8 bytes
97	ld1	value = [src], 1	// value = *src++
98	;;
99	st1	[dest] = value, 1	// *dest++ = value
100	cmp.eq	p6, p0 = value, char
101(p6)	br.cond.spnt .foundit
102	br.cloop.dptk .l1
	// dest is now 8-byte aligned.  Split len into whole words
	// (loopcnt, driving ar.lc) and a tail of len % 8 bytes.
103.dest_aligned:
104	and	sh1 = 7, src 		// sh1 = src % 8
105	and	tmp = -8, len   	// tmp = len & -OPSIZ
106	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
107	shr.u	loopcnt = len, 3	// loopcnt = len / 8
108	and	len = 7, len ;;		// len = len % 8
109	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
110	adds	loopcnt = -1, loopcnt	// --loopcnt
	// Only bit 16 is set in the value written to pr.rot, i.e. only
	// the first rotating predicate (p16) starts out true.
111	mov     pr.rot = 1 << 16 ;;	// set rotating predicates
112	sub	sh2 = 64, sh1		// sh2 = 64 - sh1
113	mov	ar.lc = loopcnt		// set LC
114	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
115(p6)    br.cond.sptk .src_aligned ;;
	// src is not 8-byte aligned: each destination word is assembled
	// from two consecutive aligned source words.
116	add	src = src, tmp		// src += len & -OPSIZ
117	mov	ar.ec = MEMLAT + 6 + 1 	// six more passes needed
118	ld8	r[1] = [asrc], 8 	// r[1] = w0
119	cmp.ne	p6, p0 = r0, r0	;;	// clear p6
120	ALIGN(32)
	// Pipelined loop for misaligned src.  Per word: speculative load,
	// shift/or to form val, xor with charx8 so that a byte equal to
	// char becomes zero, czx1.r to locate the first zero byte (the
	// compare below treats 8 as "no zero byte"), then store.
121.l2:
122(p[0])		ld8.s	r[0] = [asrc], 8		// r[0] = w1
123(p[MEMLAT])	shr.u	tmp1[0] = r[1 + MEMLAT], sh1	// tmp1 = w0 >> sh1
124(p[MEMLAT])	shl	tmp2[0] = r[0 + MEMLAT], sh2  	// tmp2 = w1 << sh2
125(p[MEMLAT+4])	xor	tmp3[0] = val[1], charx8
126(p[MEMLAT+5])	czx1.r	pos0[0] = tmp3[1]
127(p[MEMLAT+6])	chk.s	r[6 + MEMLAT], .recovery1	// our data isn't
128							// valid - rollback!
129(p[MEMLAT+6])	cmp.ne	p6, p0 = 8, pos0[1]	// p6 = word contains char
130(p6)		br.cond.spnt	.gotit
131(p[MEMLAT+6])	st8	[dest] = val[3], 8		// store val to dest
132(p[MEMLAT+3])	or	val[0] = tmp1[3], tmp2[3] 	// val = tmp1 | tmp2
133		br.ctop.sptk    .l2
134		br.cond.sptk .cpyfew		// copy the len % 8 tail
135
	// src and dest are both 8-byte aligned: words are copied as
	// loaded, so the pipeline is shorter than in .l2.
136.src_aligned:
137		cmp.ne  p6, p0 = r0, r0			// clear p6
138		mov     ar.ec = MEMLAT + 2 + 1 ;;	// set EC
	// Pipelined loop for aligned src.  Here the "word contains
	// char" result goes to p7, which lets .gotit tell the two
	// loops apart (p6 from .l2, p7 from .l3).
139.l3:
140(p[0])		ld8.s	r[0] = [src], 8
141(p[MEMLAT])	xor	tmp3[0] = r[MEMLAT], charx8
142(p[MEMLAT+1])	czx1.r	pos0[0] = tmp3[1]
143(p[MEMLAT+2])	cmp.ne	p7, p0 = 8, pos0[1]
144(p[MEMLAT+2])	chk.s	r[MEMLAT+2], .recovery2
145(p7)		br.cond.spnt	.gotit
	// .recovery2 branches back here after reloading the word
	// non-speculatively and finding no char in it.
146.back2:
147(p[MEMLAT+2])	st8	[dest] = r[MEMLAT+2], 8
148		br.ctop.dptk .l3
	// Byte-at-a-time copy of len bytes.  Reached for short inputs,
	// for the tail after .l2/.l3 (by branch or fall-through), and
	// from .recovery1.
149.cpyfew:
150	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
151	adds	len = -1, len		// --len;
152(p6)	br.cond.spnt	.restore_and_exit ;;
153	mov	ar.lc = len
154.l4:
155	ld1	value = [src], 1
156	;;
157	st1	[dest] = value, 1
158	cmp.eq	p6, p0 = value, char
159(p6)	br.cond.spnt .foundit
160	br.cloop.dptk	.l4 ;;
	// Reached by branch with p6 set (char was just stored, dest
	// already points past it) or by fall-through from .l4 with p6
	// clear, in which case ret0 keeps its initial 0.
161.foundit:
162(p6)	mov	ret0 = dest
163.restore_and_exit:
164	mov     pr = saved_pr, -1    	// restore the predicate registers
165	mov 	ar.lc = saved_lc	// restore the loop counter
166	mov 	ar.ec = saved_ec ;;	// restore the epilog counter
167	br.ret.sptk.many b0
	// A word containing char was detected.  pos0[1] is the index of
	// char within that word; ar.lc = pos0[1] makes .l5 run
	// pos0[1] + 1 times, storing the bytes up to and including char.
168.gotit:
169	.pred.rel "mutex" p6, p7
170(p6)	mov	value = val[3]		// if coming from l2
171(p7)	mov	value = r[MEMLAT+2]	// if coming from l3
172	mov	ar.lc = pos0[1] ;;
173.l5:
174	extr.u	tmp = value, 0, 8 ;;	// tmp = lowest byte of value
175	st1	[dest] = tmp, 1
176	shr.u	value = value, 8	// move on to the next byte
177	br.cloop.sptk .l5 ;;
178	mov 	ret0 = dest		// dest now points just past char
	// NOTE(review): this exit restores pr and ar.lc but, unlike
	// .restore_and_exit, does not write saved_ec back to ar.ec.
	// Presumably br.ret restores ar.ec from ar.pfs -- confirm.
179	mov	pr = saved_pr, -1
180	mov	ar.lc = saved_lc
181	br.ret.sptk.many b0
182
	// chk.s target for .l2.  Recomputes src and len from asrc, the
	// rotating predicates p[0]..p[11], ar.lc and ar.ec, then
	// continues in the byte loop at .cpyfew.  The predicate indices
	// are hard-coded, hence the MEMLAT == 6 requirement.
183.recovery1:
184#if MEMLAT != 6
185# error "MEMLAT must be 6!"
186#endif
187	adds	src = -8, asrc
188	mov	loopcnt = ar.lc
189	mov	tmp = ar.ec
190	;;
	// Step src back by 8 for every predicate p[0]..p[11] that is set.
191(p[0])	adds	src = -8, src
192	;;
193(p[1])	adds	src = -8, src
194	sub	sh1 = (MEMLAT + 6 + 1), tmp	// sh1 = 13 - ar.ec
195	;;
196(p[2])	adds	src = -8, src
197	;;
198(p[3])	adds	src = -8, src
199	shl	loopcnt = loopcnt, 3		// loopcnt = ar.lc * 8
200	;;
201(p[4])	adds	src = -8, src
202	;;
203(p[5])	adds	src = -8, src
204	shl	sh1 = sh1, 3			// sh1 = (13 - ar.ec) * 8
205	;;
206(p[6])	adds	src = -8, src
207	;;
208(p[7])	adds	src = -8, src
209	shl	tmp = tmp, 3			// tmp = ar.ec * 8
210	;;
211(p[8])	adds	src = -8, src
212	;;
213(p[9])	adds	src = -8, src
214	shr.u	sh2 = sh2, 3			// sh2 from bits to bytes
215	;;
216(p[10])	adds	src = -8, src
217	;;
218(p[11])	adds	src = -8, src
219	add	len = len, loopcnt		// len += ar.lc * 8
220	;;
221	sub	src = src, sh2
222	;;
223	add	len = tmp, len			// len += ar.ec * 8
224	add	src = sh1, src
225	br.cond.sptk .cpyfew
226
	// chk.s target for .l3.  If p7 is already set the word is handled
	// by .gotit.  Otherwise tmp is stepped back from src by 8, plus 8
	// for every predicate p[0]..p[7] that is set, the word at tmp is
	// reloaded with a non-speculative ld8, and the search for char
	// is redone on it.
227.recovery2:
228#if MEMLAT != 6
229# error "MEMLAT must be 6!"
230#endif
231	add	tmp = -8, src
232(p7)	br.cond.spnt .gotit
233	;;
234(p[0])	add	tmp = -8, tmp ;;
235(p[1])	add	tmp = -8, tmp ;;
236(p[2])	add	tmp = -8, tmp ;;
237(p[3])	add	tmp = -8, tmp ;;
238(p[4])	add	tmp = -8, tmp ;;
239(p[5])	add	tmp = -8, tmp ;;
240(p[6])	add	tmp = -8, tmp ;;
241(p[7])	add	tmp = -8, tmp ;;
242	ld8	r[MEMLAT+2] = [tmp] ;;
243	xor	pos0[1] = r[MEMLAT+2], charx8 ;;
244	czx1.r	pos0[1] = pos0[1] ;;
245	cmp.ne	p7, p6 = 8, pos0[1]	// p7 = char found, p6 = not found
246(p7)	br.cond.spnt .gotit
247	br.cond.sptk .back2		// no char: resume the .l3 pipeline
248END(memccpy)
249