# Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
 # the result to a second limb vector.
 #
 # Copyright (C) 2000-2021 Free Software Foundation, Inc.
 #
 # This file is part of the GNU MP Library.
 #
 # The GNU MP Library is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published
 # by the Free Software Foundation; either version 2.1 of the License, or (at
 # your option) any later version.
 #
 # The GNU MP Library is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 # License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with the GNU MP Library.  If not, see <https://www.gnu.org/licenses/>.

 # INPUT PARAMETERS
 # res_ptr   $16
 # s1_ptr    $17
 # size      $18
 # s2_limb   $19
 #
 # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
 # exactly 3.625 cycles/limb on EV6...
 #
 # This code was written in close cooperation with ev6 pipeline expert
 # Steve Root (root@toober.hlo.dec.com).  Any errors are tege's fault, though.
 #
 # Register usages for unrolled loop:
 #   0-3     mul's
 #   4-7     acc's
 #   8-15    mul results
 #   20,21   carry's
 #   22,23   save for stores
 #
 # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
 #
 # The stores can issue a cycle late so we have paired no-op's to 'catch'
 # them, so that further disturbance to the schedule is damped.
 #
 # We couldn't pair the loads, because the entangled schedule of the
 # carry's has to happen on one side {0} of the machine.  Note, the total
 # use of U0, and the total use of L0 (after attending to the stores),
 # which is part of the reason why....
#
 # This is a great schedule for the d_cache, a poor schedule for the
 # b_cache.  The lockup on U0 means that any stall can't be recovered
 # from.  Consider a ldq in L1.  Say that load gets stalled because it
 # collides with a fill from the b_cache.  On the next cycle, this load
 # gets priority.  It first looks at L0, and goes there.  The instruction
 # we intended for L0 gets to look at L1, which is NOT where we want
 # it.  It either stalls 1, because it can't go in L0, or goes there, and
 # causes a further instruction to stall.
 #
 # So for b_cache, we're likely going to want to put one or more cycles
 # back into the code!  And, of course, put in prefetches.  For the
 # accumulator, lds, intent to modify.  For the multiplier, you might
 # want ldq, evict next, if you're not wanting to use it again soon.  Use
 # 256 ahead of present pointer value.  At a place where we have an mt
 # followed by a bookkeeping, put the bookkeeping in upper, and the
 # prefetch into lower.
 #
 # Note, the usage of physical registers per cycle is smoothed off, as
 # much as possible.
 #
 # Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
 # like not to have a ldq or stq precede a conditional branch in a
 # quadpack.  The conditional branch moves the retire pointer one cycle
 # later.
 #
 # Optimization notes:
 # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
 # Reserved regs: $29 $30 $31
 # Free caller-saves regs in unrolled code: $24 $25 $28
 # We should swap some of the callee-saves regs for some of the free
 # caller-saves regs, saving some overhead cycles.
 # Most importantly, we should write fast code for the 0-7 case.
 # The code we use there is for the 21164, and runs at 7 cycles/limb
 # on the 21264.  Should not be hard, if we write specialized code for
 # 1-7 limbs (the one for 0 limbs should be straightforward).
# We then just need a jump table indexed by the low 3 bits of the count
 # argument.

        .set    noreorder
        .set    noat
        .text

 # ----------------------------------------------------------------------
 # __mpn_addmul_1
 #
 #   In:   $16 = res_ptr, $17 = s1_ptr, $18 = size, $19 = s2_limb
 #   Out:  $0  = carry limb left over after the last position
 #
 #   For each limb the routine forms s1_ptr[i] * s2_limb (mulq gives the
 #   low half, umulh the high half), adds the low half and the incoming
 #   carry limb to res_ptr[i], stores the sum back to res_ptr[i], and
 #   passes the high half plus the carry-outs of the two additions on to
 #   the next limb.
 #
 #   Two code paths:
 #     size <  8 : a one-limb-at-a-time loop; $30 is not touched.
 #     size >= 8 : ($Large) saves $9-$15 on the stack, handles size & 7
 #                 limbs with the same one-limb loop, then size >> 3
 #                 groups of eight limbs in a software-pipelined loop.
 #
 #   Scratch registers written: $0-$5 on the short path; $0-$15 and
 #   $20-$23 on the $Large path ($9-$15 are saved and restored there).
 #
 #   NOTE(review): the first limb is loaded before size is tested, so
 #   size is presumably required to be >= 1 -- confirm against callers.
 # ----------------------------------------------------------------------
        .globl  __mpn_addmul_1
        .ent    __mpn_addmul_1
__mpn_addmul_1:
        .frame  $30,0,$26,0
        .prologue 0

        cmpult  $18, 8, $1              # $1 = (size < 8)
        beq     $1, $Large              # size >= 8: take the unrolled path

 # ---- Short path, first limb ----
        ldq     $2, 0($17)              # $2 = s1_ptr[0]
        addq    $17, 8, $17             # advance s1_ptr
        subq    $18, 1, $18             # one limb consumed
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = res_ptr[0]
        umulh   $2, $19, $0             # $0 = high half of product
        beq     $18, $Lend0b            # that was the only limb
        ldq     $2, 0($17)              # $2 = next s1 limb
        addq    $17, 8, $17             # advance s1_ptr
        subq    $18, 1, $18             # one limb consumed
        addq    $5, $3, $3              # $3 = acc + low half
        cmpult  $3, $5, $4              # $4 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        beq     $18, $Lend0a            # exactly two limbs: skip the loop

 # ---- Short path, steady state: one limb per iteration ----
 # On entry to each iteration $2 holds the limb to multiply and
 # $0 + $4 together make up the carry limb for this position.
        .align  3
$Loop0:
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = accumulator limb
        addq    $4, $0, $0              # $0 = incoming carry limb
        subq    $18, 1, $18             # one limb consumed
        umulh   $2, $19, $4             # $4 = high half of product
        ldq     $2, 0($17)              # $2 = next s1 limb
        addq    $17, 8, $17             # advance s1_ptr
        addq    $3, $0, $3              # low half + carry limb
        cmpult  $3, $0, $0              # $0 = carry out of that add
        addq    $5, $3, $3              # ... + accumulator limb
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        addq    $5, $0, $0              # $0 = sum of both carry bits
        bne     $18, $Loop0

 # ---- Short path, last limb (already loaded into $2) ----
$Lend0a:
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = accumulator limb
        addq    $4, $0, $0              # $0 = incoming carry limb
        umulh   $2, $19, $4             # $4 = high half of product
        addq    $3, $0, $3              # low half + carry limb
        cmpult  $3, $0, $0              # $0 = carry out of that add
        addq    $5, $3, $3              # ... + accumulator limb
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $5, $0, $0              # $0 = sum of both carry bits
        addq    $4, $0, $0              # return value = high half + carries
        ret     $31, ($26), 1

 # ---- Short path, size == 1 ----
$Lend0b:
        addq    $5, $3, $3              # $3 = acc + low half
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $0, $5, $0              # return value = high half + carry
        ret     $31, ($26), 1

 # ---- size >= 8 ----
 # Lowers $30 by 240 and saves $9-$15 at offsets 8..56; no other offset
 # of that area is written in this routine.
 # NOTE(review): the .frame directive at the entry point declares a
 # zero-size frame although this path moves $30 -- confirm whether the
 # unwind description matters for this routine.
$Large:
        lda     $30, -240($30)
        stq     $9, 8($30)
        stq     $10, 16($30)
        stq     $11, 24($30)
        stq     $12, 32($30)
        stq     $13, 40($30)
        stq     $14, 48($30)
        stq     $15, 56($30)

        and     $18, 7, $20             # $20 = leftover limbs (size mod 8)
        srl     $18, 3, $18             # $18 = number of 8-limb groups
        bis     $31, $31, $0            # carry limb = 0
        beq     $20, $Lunroll           # no leftovers: go straight to groups

 # ---- Leftover limbs, first limb (same shape as the short path,
 #      counting in $20 instead of $18) ----
        ldq     $2, 0($17)              # $2 = s1_ptr[0]
        addq    $17, 8, $17             # advance s1_ptr
        subq    $20, 1, $20             # one limb consumed
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = res_ptr[0]
        umulh   $2, $19, $0             # $0 = high half of product
        beq     $20, $Lend1b            # only one leftover limb
        ldq     $2, 0($17)              # $2 = next s1 limb
        addq    $17, 8, $17             # advance s1_ptr
        subq    $20, 1, $20             # one limb consumed
        addq    $5, $3, $3              # $3 = acc + low half
        cmpult  $3, $5, $4              # $4 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        beq     $20, $Lend1a            # exactly two leftovers: skip the loop

        .align  3
$Loop1:
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = accumulator limb
        addq    $4, $0, $0              # $0 = incoming carry limb
        subq    $20, 1, $20             # one limb consumed
        umulh   $2, $19, $4             # $4 = high half of product
        ldq     $2, 0($17)              # $2 = next s1 limb
        addq    $17, 8, $17             # advance s1_ptr
        addq    $3, $0, $3              # low half + carry limb
        cmpult  $3, $0, $0              # $0 = carry out of that add
        addq    $5, $3, $3              # ... + accumulator limb
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        addq    $5, $0, $0              # $0 = sum of both carry bits
        bne     $20, $Loop1

 # ---- Last leftover limb (already loaded into $2) ----
$Lend1a:
        mulq    $2, $19, $3             # $3 = low half of product
        ldq     $5, 0($16)              # $5 = accumulator limb
        addq    $4, $0, $0              # $0 = incoming carry limb
        umulh   $2, $19, $4             # $4 = high half of product
        addq    $3, $0, $3              # low half + carry limb
        cmpult  $3, $0, $0              # $0 = carry out of that add
        addq    $5, $3, $3              # ... + accumulator limb
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        addq    $5, $0, $0              # $0 = sum of both carry bits
        addq    $4, $0, $0              # $0 = carry limb into the groups
        br      $31, $Lunroll

 # ---- Exactly one leftover limb ----
$Lend1b:
        addq    $5, $3, $3              # $3 = acc + low half
        cmpult  $3, $5, $5              # $5 = carry out of that add
        stq     $3, 0($16)              # write result limb
        addq    $16, 8, $16             # advance res_ptr
        addq    $0, $5, $0              # $0 = carry limb into the groups

 # ---- Set-up for the 8-limb groups ----
 # Both pointers are biased down by 16 bytes, which is why the first
 # loads below use offsets 16 and 24.  The running carry limb is moved
 # from $0 to $12.
$Lunroll:
        lda     $17, -16($17)           # L1 bookkeeping
        lda     $16, -16($16)           # L1 bookkeeping
        bis     $0, $31, $12            # $12 = carry limb so far

 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
 #
 # Register roles in the pipelined code (from the file header):
 #   $0-$3 multiplicand limbs, $4-$7 accumulator limbs, $8-$15 product
 #   halves, $20/$21 carry bits, $22/$23 sums waiting to be stored.
 # The U0/U1/L0/L1 tags on each line are the slot annotations carried
 # over unchanged from the original source; the instruction order is
 # kept exactly as it was because the schedule depends on it.
 # This section stores the first two result limbs of the first group
 # and leaves the third pending in $22.

        ldq     $2, 16($17)             # L1
        ldq     $3, 24($17)             # L1
        lda     $18, -1($18)            # L1 bookkeeping
        ldq     $6, 16($16)             # L1
        ldq     $7, 24($16)             # L1
        ldq     $0, 32($17)             # L1
        mulq    $19, $2, $13            # U1
        ldq     $1, 40($17)             # L1
        umulh   $19, $2, $14            # U1
        mulq    $19, $3, $15            # U1
        lda     $17, 64($17)            # L1 bookkeeping
        ldq     $4, 32($16)             # L1
        ldq     $5, 40($16)             # L1
        umulh   $19, $3, $8             # U1
        ldq     $2, -16($17)            # L1
        mulq    $19, $0, $9             # U1
        ldq     $3, -8($17)             # L1
        umulh   $19, $0, $10            # U1
        addq    $6, $13, $6             # L0 lo + acc
        mulq    $19, $1, $11            # U1
        cmpult  $6, $13, $20            # L0 lo add => carry
        lda     $16, 64($16)            # L1 bookkeeping
        addq    $6, $12, $22            # U0 hi add => answer
        cmpult  $22, $12, $21           # L0 hi add => carry
        addq    $14, $20, $14           # U0 hi mul + carry
        ldq     $6, -16($16)            # L1
        addq    $7, $15, $23            # L0 lo + acc
        addq    $14, $21, $14           # U0 hi mul + carry
        ldq     $7, -8($16)             # L1
        umulh   $19, $1, $12            # U1
        cmpult  $23, $15, $20           # L0 lo add => carry
        addq    $23, $14, $23           # U0 hi add => answer
        ldq     $0, 0($17)              # L1
        mulq    $19, $2, $13            # U1
        cmpult  $23, $14, $21           # L0 hi add => carry
        addq    $8, $20, $8             # U0 hi mul + carry
        ldq     $1, 8($17)              # L1
        umulh   $19, $2, $14            # U1
        addq    $4, $9, $4              # L0 lo + acc
        stq     $22, -48($16)           # L0
        stq     $23, -40($16)           # L1
        mulq    $19, $3, $15            # U1
        addq    $8, $21, $8             # U0 hi mul + carry
        cmpult  $4, $9, $20             # L0 lo add => carry
        addq    $4, $8, $22             # U0 hi add => answer
        ble     $18, $Lend              # U1 bookkeeping

 # ____ MAIN UNROLLED LOOP ____
 #
 # One pass stores eight result limbs (offsets -32 .. 8 from the old
 # $16, then -48 and -40 from the advanced $16), advances $16 and $17
 # by 64 bytes each and decrements the group counter $18 once.
 # Instructions are laid out in groups of four; "bis $31, $31, $31"
 # writes the zero register and serves purely as a slot filler.
        .align  4
$Loop:
        bis     $31, $31, $31           # U1 mt
        cmpult  $22, $8, $21            # L0 hi add => carry
        addq    $10, $20, $10           # U0 hi mul + carry
        ldq     $4, 0($16)              # L1

        bis     $31, $31, $31           # U1 mt
        addq    $5, $11, $23            # L0 lo + acc
        addq    $10, $21, $10           # L0 hi mul + carry
        ldq     $5, 8($16)              # L1

        umulh   $19, $3, $8             # U1
        cmpult  $23, $11, $20           # L0 lo add => carry
        addq    $23, $10, $23           # U0 hi add => answer
        ldq     $2, 16($17)             # L1

        mulq    $19, $0, $9             # U1
        cmpult  $23, $10, $21           # L0 hi add => carry
        addq    $12, $20, $12           # U0 hi mul + carry
        ldq     $3, 24($17)             # L1

        umulh   $19, $0, $10            # U1
        addq    $6, $13, $6             # L0 lo + acc
        stq     $22, -32($16)           # L0
        stq     $23, -24($16)           # L1

        bis     $31, $31, $31           # L0 st slosh
        mulq    $19, $1, $11            # U1
        bis     $31, $31, $31           # L1 st slosh
        addq    $12, $21, $12           # U0 hi mul + carry

        cmpult  $6, $13, $20            # L0 lo add => carry
        bis     $31, $31, $31           # U1 mt
        lda     $18, -1($18)            # L1 bookkeeping
        addq    $6, $12, $22            # U0 hi add => answer

        bis     $31, $31, $31           # U1 mt
        cmpult  $22, $12, $21           # L0 hi add => carry
        addq    $14, $20, $14           # U0 hi mul + carry
        ldq     $6, 16($16)             # L1

        bis     $31, $31, $31           # U1 mt
        addq    $7, $15, $23            # L0 lo + acc
        addq    $14, $21, $14           # U0 hi mul + carry
        ldq     $7, 24($16)             # L1

        umulh   $19, $1, $12            # U1
        cmpult  $23, $15, $20           # L0 lo add => carry
        addq    $23, $14, $23           # U0 hi add => answer
        ldq     $0, 32($17)             # L1

        mulq    $19, $2, $13            # U1
        cmpult  $23, $14, $21           # L0 hi add => carry
        addq    $8, $20, $8             # U0 hi mul + carry
        ldq     $1, 40($17)             # L1

        umulh   $19, $2, $14            # U1
        addq    $4, $9, $4              # U0 lo + acc
        stq     $22, -16($16)           # L0
        stq     $23, -8($16)            # L1

        bis     $31, $31, $31           # L0 st slosh
        mulq    $19, $3, $15            # U1
        bis     $31, $31, $31           # L1 st slosh
        addq    $8, $21, $8             # L0 hi mul + carry

        cmpult  $4, $9, $20             # L0 lo add => carry
        bis     $31, $31, $31           # U1 mt
        lda     $17, 64($17)            # L1 bookkeeping
        addq    $4, $8, $22             # U0 hi add => answer

        bis     $31, $31, $31           # U1 mt
        cmpult  $22, $8, $21            # L0 hi add => carry
        addq    $10, $20, $10           # U0 hi mul + carry
        ldq     $4, 32($16)             # L1

        bis     $31, $31, $31           # U1 mt
        addq    $5, $11, $23            # L0 lo + acc
        addq    $10, $21, $10           # L0 hi mul + carry
        ldq     $5, 40($16)             # L1

        umulh   $19, $3, $8             # U1
        cmpult  $23, $11, $20           # L0 lo add => carry
        addq    $23, $10, $23           # U0 hi add => answer
        ldq     $2, -16($17)            # L1

        mulq    $19, $0, $9             # U1
        cmpult  $23, $10, $21           # L0 hi add => carry
        addq    $12, $20, $12           # U0 hi mul + carry
        ldq     $3, -8($17)             # L1

        umulh   $19, $0, $10            # U1
        addq    $6, $13, $6             # L0 lo + acc
        stq     $22, 0($16)             # L0
        stq     $23, 8($16)             # L1

        bis     $31, $31, $31           # L0 st slosh
        mulq    $19, $1, $11            # U1
        bis     $31, $31, $31           # L1 st slosh
        addq    $12, $21, $12           # U0 hi mul + carry

        cmpult  $6, $13, $20            # L0 lo add => carry
        bis     $31, $31, $31           # U1 mt
        lda     $16, 64($16)            # L1 bookkeeping
        addq    $6, $12, $22            # U0 hi add => answer

        bis     $31, $31, $31           # U1 mt
        cmpult  $22, $12, $21           # L0 hi add => carry
        addq    $14, $20, $14           # U0 hi mul + carry
        ldq     $6, -16($16)            # L1

        bis     $31, $31, $31           # U1 mt
        addq    $7, $15, $23            # L0 lo + acc
        addq    $14, $21, $14           # U0 hi mul + carry
        ldq     $7, -8($16)             # L1

        umulh   $19, $1, $12            # U1
        cmpult  $23, $15, $20           # L0 lo add => carry
        addq    $23, $14, $23           # U0 hi add => answer
        ldq     $0, 0($17)              # L1

        mulq    $19, $2, $13            # U1
        cmpult  $23, $14, $21           # L0 hi add => carry
        addq    $8, $20, $8             # U0 hi mul + carry
        ldq     $1, 8($17)              # L1

        umulh   $19, $2, $14            # U1
        addq    $4, $9, $4              # L0 lo + acc
        stq     $22, -48($16)           # L0
        stq     $23, -40($16)           # L1

        bis     $31, $31, $31           # L0 st slosh
        mulq    $19, $3, $15            # U1
        bis     $31, $31, $31           # L1 st slosh
        addq    $8, $21, $8             # U0 hi mul + carry

        cmpult  $4, $9, $20             # L0 lo add => carry
        addq    $4, $8, $22             # U0 hi add => answer
        bis     $31, $31, $31           # L1 mt
        bgt     $18, $Loop              # U1 bookkeeping

 # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
 #
 # Drains the pipeline: the remaining six result limbs of the last
 # group are stored at offsets -32 .. 8 from $16, no further s1 limbs
 # are loaded, and the final carry limb is left in $0.
$Lend:
        cmpult  $22, $8, $21            # L0 hi add => carry
        addq    $10, $20, $10           # U0 hi mul + carry
        ldq     $4, 0($16)              # L1
        addq    $5, $11, $23            # L0 lo + acc
        addq    $10, $21, $10           # L0 hi mul + carry
        ldq     $5, 8($16)              # L1
        umulh   $19, $3, $8             # U1
        cmpult  $23, $11, $20           # L0 lo add => carry
        addq    $23, $10, $23           # U0 hi add => answer
        mulq    $19, $0, $9             # U1
        cmpult  $23, $10, $21           # L0 hi add => carry
        addq    $12, $20, $12           # U0 hi mul + carry
        umulh   $19, $0, $10            # U1
        addq    $6, $13, $6             # L0 lo + acc
        stq     $22, -32($16)           # L0
        stq     $23, -24($16)           # L1
        mulq    $19, $1, $11            # U1
        addq    $12, $21, $12           # U0 hi mul + carry
        cmpult  $6, $13, $20            # L0 lo add => carry
        addq    $6, $12, $22            # U0 hi add => answer
        cmpult  $22, $12, $21           # L0 hi add => carry
        addq    $14, $20, $14           # U0 hi mul + carry
        addq    $7, $15, $23            # L0 lo + acc
        addq    $14, $21, $14           # U0 hi mul + carry
        umulh   $19, $1, $12            # U1
        cmpult  $23, $15, $20           # L0 lo add => carry
        addq    $23, $14, $23           # U0 hi add => answer
        cmpult  $23, $14, $21           # L0 hi add => carry
        addq    $8, $20, $8             # U0 hi mul + carry
        addq    $4, $9, $4              # U0 lo + acc
        stq     $22, -16($16)           # L0
        stq     $23, -8($16)            # L1
        bis     $31, $31, $31           # L0 st slosh
        addq    $8, $21, $8             # L0 hi mul + carry
        cmpult  $4, $9, $20             # L0 lo add => carry
        addq    $4, $8, $22             # U0 hi add => answer
        cmpult  $22, $8, $21            # L0 hi add => carry
        addq    $10, $20, $10           # U0 hi mul + carry
        addq    $5, $11, $23            # L0 lo + acc
        addq    $10, $21, $10           # L0 hi mul + carry
        cmpult  $23, $11, $20           # L0 lo add => carry
        addq    $23, $10, $23           # U0 hi add => answer
        cmpult  $23, $10, $21           # L0 hi add => carry
        addq    $12, $20, $12           # U0 hi mul + carry
        stq     $22, 0($16)             # L0
        stq     $23, 8($16)             # L1
        addq    $12, $21, $0            # U0 hi mul + carry

 # ---- Epilogue for the $Large path: restore $9-$15, release the
 #      240 bytes taken from $30, return with the carry limb in $0 ----
        ldq     $9, 8($30)
        ldq     $10, 16($30)
        ldq     $11, 24($30)
        ldq     $12, 32($30)
        ldq     $13, 40($30)
        ldq     $14, 48($30)
        ldq     $15, 56($30)
        lda     $30, 240($30)
        ret     $31, ($26), 1

        .end    __mpn_addmul_1