1 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 # the result to a second limb vector.
3 #
4 #  Copyright (C) 2000-2021 Free Software Foundation, Inc.
5 #
6 #  This file is part of the GNU MP Library.
7 #
8 #  The GNU MP Library is free software; you can redistribute it and/or modify
9 #  it under the terms of the GNU Lesser General Public License as published
10 #  by the Free Software Foundation; either version 2.1 of the License, or (at
11 #  your option) any later version.
12 #
13 #  The GNU MP Library is distributed in the hope that it will be useful, but
14 #  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 #  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 #  License for more details.
17 #
18 #  You should have received a copy of the GNU Lesser General Public License
19 #  along with the GNU MP Library.  If not, see <https://www.gnu.org/licenses/>.
20
21 #  INPUT PARAMETERS
22 #  res_ptr	$16
23 #  s1_ptr	$17
24 #  size	$18
25 #  s2_limb	$19
26 #
27 #  This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
28 #  exactly 3.625 cycles/limb on EV6...
29 #
30 # This code was written in close cooperation with ev6 pipeline expert
31 # Steve Root (root@toober.hlo.dec.com).  Any errors are tege's fault, though.
32 #
33 #   Register usages for unrolled loop:
34 #	  0-3     mul's
35 #	  4-7     acc's
36 #	  8-15    mul results
37 #	  20,21   carry's
38 #	  22,23   save for stores
39 #
40 #   Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
41 #
42 #   The stores can issue a cycle late so we have paired no-op's to 'catch'
43 #   them, so that further disturbance to the schedule is damped.
44 #
45 #   We couldn't pair the loads, because the entangled schedule of the
46 #   carry's has to happen on one side {0} of the machine. Note, the total
47 #   use of U0, and the total use of L0 (after attending to the stores).
48 #   which is part of the reason why....
49 #
50 #   This is a great schedule for the d_cache, a poor schedule for the
51 #   b_cache. The lockup on U0 means that any stall can't be recovered
52 #   from. Consider a ldq in L1.  say that load gets stalled because it
53 #   collides with a fill from the b_Cache. On the next cycle, this load
54 #   gets priority. It first looks at L0, and goes there. The instruction
55 #   we intended for L0 gets to look at L1, which is NOT where we want
56 #   it. It either stalls 1, because it can't go in L0, or goes there, and
57 #   causes a further instruction to stall.
58 #
59 #   So for b_cache, we're likely going to want to put one or more cycles
60 #   back into the code! And, of course, put in prefetches. For the
61 #   accumulator, lds, intent to modify.  For the multiplier, you might
62 #   want ldq, evict next, if you're not wanting to use it again soon. Use
63 #   256 ahead of present pointer value. At a place where we have an mt
64 #   followed by a bookkeeping, put the bookkeeping in upper, and the
65 #   prefetch into lower.
66 #
67 #   Note, the usage of physical registers per cycle is smoothed off, as
68 #   much as possible.
69 #
70 #   Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
71 #   like not to have a ldq or stq precede a conditional branch in a
72 #   quadpack. The conditional branch moves the retire pointer one cycle
73 #   later.
74 #
75 #   Optimization notes:
76 #   Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
77 #   Reserved regs:	 $29 $30 $31
78 #   Free caller-saves regs in unrolled code: $24 $25 $28
79 #   We should swap some of the callee-saves regs for some of the free
80 #   caller-saves regs, saving some overhead cycles.
81 #   Most importantly, we should write fast code for the 0-7 case.
82 #   The code we use there is for the 21164, and runs at 7 cycles/limb
83 #   on the 21264.  Should not be hard, if we write specialized code for
84 #   1-7 limbs (the one for 0 limbs should be straightforward).  We then just
85 #   need a jump table indexed by the low 3 bits of the count argument.
86
87	.set	noreorder
88	.set	noat
89	.text
90
 # __mpn_addmul_1
 #
 # For each of the `size` ($18) limbs: multiply the limb at s1_ptr ($17) by
 # s2_limb ($19), add the 128-bit product plus the running carry limb to the
 # limb at res_ptr ($16), and store the low 64 bits back through res_ptr.
 # The carry limb left over after the last limb is in $0 at every `ret`.
 #
 # Dispatch: sizes below 8 use the one-limb-per-pass code directly below;
 # sizes of 8 or more branch to $Large.
 #
 # NOTE(review): the first limb is loaded and multiplied before size is ever
 # compared with zero, so this code does not handle size == 0 -- callers are
 # presumably required to pass size >= 1; confirm against the mpn interface.
91	.globl	__mpn_addmul_1
92	.ent	__mpn_addmul_1
93__mpn_addmul_1:
94	.frame	$30,0,$26,0
95	.prologue 0
96
97	cmpult	$18,	8,	$1	# $1 = (size < 8), unsigned
98	beq	$1,	$Large		# size >= 8: take the unrolled path
99
 # ---- size < 8: first limb (no carry-in yet) ----
100	ldq	$2,	0($17)		# $2 = s1_limb
101	addq	$17,	8,	$17	# s1_ptr++
102	subq	$18,	1,	$18	# size--
103	mulq	$2,	$19,	$3	# $3 = prod_low
104	ldq	$5,	0($16)		# $5 = *res_ptr
105	umulh	$2,	$19,	$0	# $0 = prod_high
106	beq	$18,	$Lend0b		# jump if size was == 1
107	ldq	$2,	0($17)		# $2 = s1_limb
108	addq	$17,	8,	$17	# s1_ptr++
109	subq	$18,	1,	$18	# size--
110	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
111	cmpult	$3,	$5,	$4	# $4 = carry out of that add
112	stq	$3,	0($16)		# store first result limb
113	addq	$16,	8,	$16	# res_ptr++
114	beq	$18,	$Lend0a		# jump if size was == 2
115
 # ---- size < 8: middle limbs ----
 # On entry to each pass $2 already holds the limb to multiply, and $0 and $4
 # together hold the carry-in (summed by the first addq).  The pass also loads
 # the following limb into $2 for the next pass (or for $Lend0a).
116	.align 3
117$Loop0:	mulq	$2,	$19,	$3	# $3 = prod_low
118	ldq	$5,	0($16)		# $5 = *res_ptr
119	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
120	subq	$18,	1,	$18	# size--
121	umulh	$2,	$19,	$4	# $4 = cy_limb
122	ldq	$2,	0($17)		# $2 = s1_limb
123	addq	$17,	8,	$17	# s1_ptr++
124	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
125	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
126	addq	$5,	$3,	$3	# $3 += *res_ptr
127	cmpult	$3,	$5,	$5	# $5 = carry from adding *res_ptr
128	stq	$3,	0($16)		# store result limb
129	addq	$16,	8,	$16	# res_ptr++
130	addq	$5,	$0,	$0	# combine carries
131	bne	$18,	$Loop0		# more limbs after the one now in $2?
 # ---- size < 8: last limb when size >= 2 ($2 already loaded) ----
132$Lend0a:
133	mulq	$2,	$19,	$3	# $3 = prod_low
134	ldq	$5,	0($16)		# $5 = *res_ptr
135	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
136	umulh	$2,	$19,	$4	# $4 = cy_limb
137	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
138	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
139	addq	$5,	$3,	$3	# $3 += *res_ptr
140	cmpult	$3,	$5,	$5	# $5 = carry from adding *res_ptr
141	stq	$3,	0($16)		# store last result limb
142	addq	$5,	$0,	$0	# combine carries
143	addq	$4,	$0,	$0	# cy_limb = prod_high + cy
144	ret	$31,	($26),	1	# return; carry limb is in $0
 # ---- size == 1: finish the only limb ----
145$Lend0b:
146	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
147	cmpult	$3,	$5,	$5	# $5 = carry out of that add
148	stq	$3,	0($16)		# store the result limb
149	addq	$0,	$5,	$0	# $0 = prod_high + carry
150	ret	$31,	($26),	1	# return; carry limb is in $0
151
152$Large:
 # size >= 8.  Allocate a stack frame and save $9-$15, which the unrolled
 # code below overwrites with multiply results and which are restored before
 # the final ret.  Then process the leading (size mod 8) limbs one at a time,
 # with the same per-limb code as the size < 8 path, so that the unrolled
 # code is left with a whole number of 8-limb groups.
153	lda	$30,	-240($30)	# allocate a 240-byte frame
154	stq	$9,	8($30)		# save $9-$15
155	stq	$10,	16($30)
156	stq	$11,	24($30)
157	stq	$12,	32($30)
158	stq	$13,	40($30)
159	stq	$14,	48($30)
160	stq	$15,	56($30)
161
162	and	$18,	7,	$20	# count for the first loop, 0-7
163	srl	$18,	3,	$18	# count for unrolled loop
164	bis	$31,	$31,	$0	# $0 = 0: carry-in when there are no leading limbs
165	beq	$20,	$Lunroll	# no leading limbs: go straight to the unrolled code
 # ---- leading limbs: first limb (no carry-in yet) ----
166	ldq	$2,	0($17)		# $2 = s1_limb
167	addq	$17,	8,	$17	# s1_ptr++
168	subq	$20,	1,	$20	# leading count--
169	mulq	$2,	$19,	$3	# $3 = prod_low
170	ldq	$5,	0($16)		# $5 = *res_ptr
171	umulh	$2,	$19,	$0	# $0 = prod_high
172	beq	$20,	$Lend1b		# jump if leading count was == 1
173	ldq	$2,	0($17)		# $2 = s1_limb
174	addq	$17,	8,	$17	# s1_ptr++
175	subq	$20,	1,	$20	# leading count--
176	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
177	cmpult	$3,	$5,	$4	# $4 = carry out of that add
178	stq	$3,	0($16)		# store first result limb
179	addq	$16,	8,	$16	# res_ptr++
180	beq	$20,	$Lend1a		# jump if leading count was == 2
181
 # ---- leading limbs: middle limbs ($2 preloaded; $0 and $4 = carry-in) ----
182	.align 3
183$Loop1:	mulq	$2,	$19,	$3	# $3 = prod_low
184	ldq	$5,	0($16)		# $5 = *res_ptr
185	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
186	subq	$20,	1,	$20	# leading count--
187	umulh	$2,	$19,	$4	# $4 = cy_limb
188	ldq	$2,	0($17)		# $2 = s1_limb
189	addq	$17,	8,	$17	# s1_ptr++
190	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
191	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
192	addq	$5,	$3,	$3	# $3 += *res_ptr
193	cmpult	$3,	$5,	$5	# $5 = carry from adding *res_ptr
194	stq	$3,	0($16)		# store result limb
195	addq	$16,	8,	$16	# res_ptr++
196	addq	$5,	$0,	$0	# combine carries
197	bne	$20,	$Loop1		# more leading limbs after the one now in $2?
198
 # ---- last leading limb when the leading count was >= 2 ----
 # Unlike $Lend0a this advances res_ptr and falls into the unrolled code
 # instead of returning.
199$Lend1a:
200	mulq	$2,	$19,	$3	# $3 = prod_low
201	ldq	$5,	0($16)		# $5 = *res_ptr
202	addq	$4,	$0,	$0	# cy_limb = cy_limb + 'cy'
203	umulh	$2,	$19,	$4	# $4 = cy_limb
204	addq	$3,	$0,	$3	# $3 = cy_limb + prod_low
205	cmpult	$3,	$0,	$0	# $0 = carry from (cy_limb + prod_low)
206	addq	$5,	$3,	$3	# $3 += *res_ptr
207	cmpult	$3,	$5,	$5	# $5 = carry from adding *res_ptr
208	stq	$3,	0($16)		# store result limb
209	addq	$16,	8,	$16	# res_ptr++
210	addq	$5,	$0,	$0	# combine carries
211	addq	$4,	$0,	$0	# cy_limb = prod_high + cy
212	br	$31,	$Lunroll	# carry limb in $0; continue with the unrolled code
 # ---- leading count == 1: finish the only leading limb, then fall through ----
213$Lend1b:
214	addq	$5,	$3,	$3	# $3 = *res_ptr + prod_low
215	cmpult	$3,	$5,	$5	# $5 = carry out of that add
216	stq	$3,	0($16)		# store result limb
217	addq	$16,	8,	$16	# res_ptr++
218	addq	$0,	$5,	$0	# $0 = prod_high + carry
219
220$Lunroll:
 # Entry to the unrolled code.  $0 holds the carry limb from the leading
 # limbs (0 if there were none) and $18 the number of 8-limb groups.
 # Both pointers are biased by -16 bytes; every displacement below is
 # relative to the biased pointers.
 #
 # Per-limb pattern used from here to the end of the routine (the tags in
 # the existing comments):
 #   lo + acc          acc    = acc + low product            (addq)
 #   lo add => carry   c1     = acc < low product            (cmpult)
 #   hi add => answer  answer = acc + incoming carry limb    (addq)
 #   hi add => carry   c2     = answer < incoming carry limb (cmpult)
 #   hi mul + carry    high product += c1, then += c2; the sum is the
 #                     incoming carry limb of the next limb
 # Answers are built in $22/$23 and stored in pairs.
 # NOTE(review): the L0/L1/U0/U1 tags presumably name the EV6 pipe each
 # instruction is scheduled for (see the header notes); they are not
 # checked by the assembler.
221	lda	$17,	-16($17)	# L1 bookkeeping: bias s1_ptr by -16
222	lda	$16,	-16($16)	# L1 bookkeeping: bias res_ptr by -16
223	bis	$0,	$31,	$12	# $12 = incoming carry limb for the first limb
224
225 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
 # Loads the first eight s1 limbs and the first six accumulator limbs,
 # starts their multiplies, and completes and stores the first two result
 # limbs.  Each pointer is advanced by 64 bytes part-way through, which is
 # why later displacements turn negative.
226
227	ldq	$2,	16($17)		# L1
228	ldq	$3,	24($17)		# L1
229	lda	$18,	-1($18)		# L1 bookkeeping: group count--
230	ldq	$6,	16($16)		# L1
231	ldq	$7,	24($16)		# L1
232	ldq	$0,	32($17)		# L1
233	mulq	$19,	$2,	$13	# U1
234	ldq	$1,	40($17)		# L1
235	umulh	$19,	$2,	$14	# U1
236	mulq	$19,	$3,	$15	# U1
237	lda	$17,	64($17)		# L1 bookkeeping: s1_ptr += 64 bytes
238	ldq	$4,	32($16)		# L1
239	ldq	$5,	40($16)		# L1
240	umulh	$19,	$3,	$8	# U1
241	ldq	$2,	-16($17)	# L1
242	mulq	$19,	$0,	$9	# U1
243	ldq	$3,	-8($17)		# L1
244	umulh	$19,	$0,	$10	# U1
245	addq	$6,	$13,	$6	# L0 lo + acc
246	mulq	$19,	$1,	$11	# U1
247	cmpult	$6,	$13,	$20	# L0 lo add => carry
248	lda	$16,	64($16)		# L1 bookkeeping: res_ptr += 64 bytes
249	addq	$6,	$12,	$22	# U0 hi add => answer
250	cmpult	$22,	$12,	$21	# L0 hi add => carry
251	addq	$14,	$20,	$14	# U0 hi mul + carry
252	ldq	$6,	-16($16)	# L1
253	addq	$7,	$15,	$23	# L0 lo + acc
254	addq	$14,	$21,	$14	# U0 hi mul + carry
255	ldq	$7,	-8($16)		# L1
256	umulh	$19,	$1,	$12	# U1
257	cmpult	$23,	$15,	$20	# L0 lo add => carry
258	addq	$23,	$14,	$23	# U0 hi add => answer
259	ldq	$0,	0($17)		# L1
260	mulq	$19,	$2,	$13	# U1
261	cmpult	$23,	$14,	$21	# L0 hi add => carry
262	addq	$8,	$20,	$8	# U0 hi mul + carry
263	ldq	$1,	8($17)		# L1
264	umulh	$19,	$2,	$14	# U1
265	addq	$4,	$9,	$4	# L0 lo + acc
266	stq	$22,	-48($16)	# L0
267	stq	$23,	-40($16)	# L1
268	mulq	$19,	$3,	$15	# U1
269	addq	$8,	$21,	$8	# U0 hi mul + carry
270	cmpult	$4,	$9,	$20	# L0 lo add => carry
271	addq	$4,	$8,	$22	# U0 hi add => answer
272	ble	$18,	$Lend		# U1 bookkeeping: no groups left for $Loop, go drain
273
274 # ____ MAIN UNROLLED LOOP ____
 # One pass handles 8 limbs: eight s1 loads, eight accumulator loads, eight
 # mulq/umulh pairs and eight stores, arranged in groups of four
 # instructions.  The pass is software-pipelined: the answers it stores
 # belong to limbs whose loads and multiplies were issued earlier (in the
 # startup code or the previous pass), and the loads and multiplies it
 # issues are consumed later (by the next pass or by $Lend).
 # `bis $31,$31,$31` is a no-op; the "mt" and "st slosh" ones are schedule
 # padding (see the header note about catching late stores).
 # $18 is decremented once per pass and the loop repeats while it is > 0.
 # The instruction order is the schedule -- do not reorder.
275	.align 4
276$Loop:
277	bis	$31,	$31,	$31	# U1 mt
278	cmpult	$22,	$8,	$21	# L0 hi add => carry
279	addq	$10,	$20,	$10	# U0 hi mul + carry
280	ldq	$4,	0($16)		# L1
281
282	bis	$31,	$31,	$31	# U1 mt
283	addq	$5,	$11,	$23	# L0 lo + acc
284	addq	$10,	$21,	$10	# L0 hi mul + carry
285	ldq	$5,	8($16)		# L1
286
287	umulh	$19,	$3,	$8	# U1
288	cmpult	$23,	$11,	$20	# L0 lo add => carry
289	addq	$23,	$10,	$23	# U0 hi add => answer
290	ldq	$2,	16($17)		# L1
291
292	mulq	$19,	$0,	$9	# U1
293	cmpult	$23,	$10,	$21	# L0 hi add => carry
294	addq	$12,	$20,	$12	# U0 hi mul + carry
295	ldq	$3,	24($17)		# L1
296
297	umulh	$19,	$0,	$10	# U1
298	addq	$6,	$13,	$6	# L0 lo + acc
299	stq	$22,	-32($16)	# L0
300	stq	$23,	-24($16)	# L1
301
302	bis	$31,	$31,	$31	# L0 st slosh
303	mulq	$19,	$1,	$11	# U1
304	bis	$31,	$31,	$31	# L1 st slosh
305	addq	$12,	$21,	$12	# U0 hi mul + carry
306
307	cmpult	$6,	$13,	$20	# L0 lo add => carry
308	bis	$31,	$31,	$31	# U1 mt
309	lda	$18,	-1($18)		# L1 bookkeeping: group count--
310	addq	$6,	$12,	$22	# U0 hi add => answer
311
312	bis	$31,	$31,	$31	# U1 mt
313	cmpult	$22,	$12,	$21	# L0 hi add => carry
314	addq	$14,	$20,	$14	# U0 hi mul + carry
315	ldq	$6,	16($16)		# L1
316
317	bis	$31,	$31,	$31	# U1 mt
318	addq	$7,	$15,	$23	# L0 lo + acc
319	addq	$14,	$21,	$14	# U0 hi mul + carry
320	ldq	$7,	24($16)		# L1
321
322	umulh	$19,	$1,	$12	# U1
323	cmpult	$23,	$15,	$20	# L0 lo add => carry
324	addq	$23,	$14,	$23	# U0 hi add => answer
325	ldq	$0,	32($17)		# L1
326
327	mulq	$19,	$2,	$13	# U1
328	cmpult	$23,	$14,	$21	# L0 hi add => carry
329	addq	$8,	$20,	$8	# U0 hi mul + carry
330	ldq	$1,	40($17)		# L1
331
332	umulh	$19,	$2,	$14	# U1
333	addq	$4,	$9,	$4	# U0 lo + acc
334	stq	$22,	-16($16)	# L0
335	stq	$23,	-8($16)		# L1
336
337	bis	$31,	$31,	$31	# L0 st slosh
338	mulq	$19,	$3,	$15	# U1
339	bis	$31,	$31,	$31	# L1 st slosh
340	addq	$8,	$21,	$8	# L0 hi mul + carry
341
342	cmpult	$4,	$9,	$20	# L0 lo add => carry
343	bis	$31,	$31,	$31	# U1 mt
344	lda	$17,	64($17)		# L1 bookkeeping: s1_ptr += 64 bytes (8 limbs)
345	addq	$4,	$8,	$22	# U0 hi add => answer
346
347	bis	$31,	$31,	$31	# U1 mt
348	cmpult	$22,	$8,	$21	# L0 hi add => carry
349	addq	$10,	$20,	$10	# U0 hi mul + carry
350	ldq	$4,	32($16)		# L1
351
352	bis	$31,	$31,	$31	# U1 mt
353	addq	$5,	$11,	$23	# L0 lo + acc
354	addq	$10,	$21,	$10	# L0 hi mul + carry
355	ldq	$5,	40($16)		# L1
356
357	umulh	$19,	$3,	$8	# U1
358	cmpult	$23,	$11,	$20	# L0 lo add => carry
359	addq	$23,	$10,	$23	# U0 hi add => answer
360	ldq	$2,	-16($17)	# L1
361
362	mulq	$19,	$0,	$9	# U1
363	cmpult	$23,	$10,	$21	# L0 hi add => carry
364	addq	$12,	$20,	$12	# U0 hi mul + carry
365	ldq	$3,	-8($17)		# L1
366
367	umulh	$19,	$0,	$10	# U1
368	addq	$6,	$13,	$6	# L0 lo + acc
369	stq	$22,	0($16)		# L0
370	stq	$23,	8($16)		# L1
371
372	bis	$31,	$31,	$31	# L0 st slosh
373	mulq	$19,	$1,	$11	# U1
374	bis	$31,	$31,	$31	# L1 st slosh
375	addq	$12,	$21,	$12	# U0 hi mul + carry
376
377	cmpult	$6,	$13,	$20	# L0 lo add => carry
378	bis	$31,	$31,	$31	# U1 mt
379	lda	$16,	64($16)		# L1 bookkeeping: res_ptr += 64 bytes (8 limbs)
380	addq	$6,	$12,	$22	# U0 hi add => answer
381
382	bis	$31,	$31,	$31	# U1 mt
383	cmpult	$22,	$12,	$21	# L0 hi add => carry
384	addq	$14,	$20,	$14	# U0 hi mul + carry
385	ldq	$6,	-16($16)	# L1
386
387	bis	$31,	$31,	$31	# U1 mt
388	addq	$7,	$15,	$23	# L0 lo + acc
389	addq	$14,	$21,	$14	# U0 hi mul + carry
390	ldq	$7,	-8($16)		# L1
391
392	umulh	$19,	$1,	$12	# U1
393	cmpult	$23,	$15,	$20	# L0 lo add => carry
394	addq	$23,	$14,	$23	# U0 hi add => answer
395	ldq	$0,	0($17)		# L1
396
397	mulq	$19,	$2,	$13	# U1
398	cmpult	$23,	$14,	$21	# L0 hi add => carry
399	addq	$8,	$20,	$8	# U0 hi mul + carry
400	ldq	$1,	8($17)		# L1
401
402	umulh	$19,	$2,	$14	# U1
403	addq	$4,	$9,	$4	# L0 lo + acc
404	stq	$22,	-48($16)	# L0
405	stq	$23,	-40($16)	# L1
406
407	bis	$31,	$31,	$31	# L0 st slosh
408	mulq	$19,	$3,	$15	# U1
409	bis	$31,	$31,	$31	# L1 st slosh
410	addq	$8,	$21,	$8	# U0 hi mul + carry
411
412	cmpult	$4,	$9,	$20	# L0 lo add => carry
413	addq	$4,	$8,	$22	# U0 hi add => answer
414	bis	$31,	$31,	$31	# L1 mt
415	bgt	$18,	$Loop		# U1 bookkeeping: repeat while groups remain
416
417# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
 # Drains the pipeline.  Loads the last two accumulator limbs, issues the
 # multiplies still outstanding for the s1 limbs already held in $0, $1 and
 # $3, and completes and stores the final six result limbs.  No further s1
 # limbs are loaded.  The carry limb out of the last limb is written to $0;
 # then $9-$15 are restored, the frame is released and the routine returns.
418$Lend:
419	cmpult	$22,	$8,	$21	# L0 hi add => carry
420	addq	$10,	$20,	$10	# U0 hi mul + carry
421	ldq	$4,	0($16)		# L1
422	addq	$5,	$11,	$23	# L0 lo + acc
423	addq	$10,	$21,	$10	# L0 hi mul + carry
424	ldq	$5,	8($16)		# L1
425	umulh	$19,	$3,	$8	# U1
426	cmpult	$23,	$11,	$20	# L0 lo add => carry
427	addq	$23,	$10,	$23	# U0 hi add => answer
428	mulq	$19,	$0,	$9	# U1
429	cmpult	$23,	$10,	$21	# L0 hi add => carry
430	addq	$12,	$20,	$12	# U0 hi mul + carry
431	umulh	$19,	$0,	$10	# U1
432	addq	$6,	$13,	$6	# L0 lo + acc
433	stq	$22,	-32($16)	# L0
434	stq	$23,	-24($16)	# L1
435	mulq	$19,	$1,	$11	# U1
436	addq	$12,	$21,	$12	# U0 hi mul + carry
437	cmpult	$6,	$13,	$20	# L0 lo add => carry
438	addq	$6,	$12,	$22	# U0 hi add => answer
439	cmpult	$22,	$12,	$21	# L0 hi add => carry
440	addq	$14,	$20,	$14	# U0 hi mul + carry
441	addq	$7,	$15,	$23	# L0 lo + acc
442	addq	$14,	$21,	$14	# U0 hi mul + carry
443	umulh	$19,	$1,	$12	# U1
444	cmpult	$23,	$15,	$20	# L0 lo add => carry
445	addq	$23,	$14,	$23	# U0 hi add => answer
446	cmpult	$23,	$14,	$21	# L0 hi add => carry
447	addq	$8,	$20,	$8	# U0 hi mul + carry
448	addq	$4,	$9,	$4	# U0 lo + acc
449	stq	$22,	-16($16)	# L0
450	stq	$23,	-8($16)		# L1
451	bis	$31,	$31,	$31	# L0 st slosh
452	addq	$8,	$21,	$8	# L0 hi mul + carry
453	cmpult	$4,	$9,	$20	# L0 lo add => carry
454	addq	$4,	$8,	$22	# U0 hi add => answer
455	cmpult	$22,	$8,	$21	# L0 hi add => carry
456	addq	$10,	$20,	$10	# U0 hi mul + carry
457	addq	$5,	$11,	$23	# L0 lo + acc
458	addq	$10,	$21,	$10	# L0 hi mul + carry
459	cmpult	$23,	$11,	$20	# L0 lo add => carry
460	addq	$23,	$10,	$23	# U0 hi add => answer
461	cmpult	$23,	$10,	$21	# L0 hi add => carry
462	addq	$12,	$20,	$12	# U0 hi mul + carry
463	stq	$22,	0($16)		# L0
464	stq	$23,	8($16)		# L1
465	addq	$12,	$21,	$0	# U0 hi mul + carry: final carry limb -> $0
466
467	ldq	$9,	8($30)		# restore $9-$15 saved at $Large
468	ldq	$10,	16($30)
469	ldq	$11,	24($30)
470	ldq	$12,	32($30)
471	ldq	$13,	40($30)
472	ldq	$14,	48($30)
473	ldq	$15,	56($30)
474	lda	$30,	240($30)	# release the 240-byte frame
475	ret	$31,	($26),	1	# return; carry limb is in $0
476
477	.end	__mpn_addmul_1
478