 # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
 # the result in a second limb vector.

 # Copyright (C) 1992-2021 Free Software Foundation, Inc.

 # This file is part of the GNU MP Library.

 # The GNU MP Library is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # the Free Software Foundation; either version 2.1 of the License, or (at your
 # option) any later version.

 # The GNU MP Library is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 # License for more details.

 # You should have received a copy of the GNU Lesser General Public License
 # along with the GNU MP Library.  If not, see <https://www.gnu.org/licenses/>.


 # INPUT PARAMETERS
 # res_ptr	r16
 # s1_ptr	r17
 # size		r18
 # s2_limb	r19

 # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.

 # To improve performance for long multiplications, we would use
 # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
 # these instructions without slowing down the general code: 1. We can
 # only have two prefetches in operation at any time in the Alpha
 # architecture.  2. There will seldom be any special alignment
 # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
 # loop into an inner and outer loop, having the inner loop handle
 # exactly one prefetch block?
38 39 .set noreorder 40 .set noat 41.text 42 .align 3 43 .globl __mpn_mul_1 44 .ent __mpn_mul_1 2 45__mpn_mul_1: 46 .frame $30,0,$26 47 48 ldq $2,0($17) # $2 = s1_limb 49 subq $18,1,$18 # size-- 50 mulq $2,$19,$3 # $3 = prod_low 51 bic $31,$31,$4 # clear cy_limb 52 umulh $2,$19,$0 # $0 = prod_high 53 beq $18,Lend1 # jump if size was == 1 54 ldq $2,8($17) # $2 = s1_limb 55 subq $18,1,$18 # size-- 56 stq $3,0($16) 57 beq $18,Lend2 # jump if size was == 2 58 59 .align 3 60Loop: mulq $2,$19,$3 # $3 = prod_low 61 addq $4,$0,$0 # cy_limb = cy_limb + 'cy' 62 subq $18,1,$18 # size-- 63 umulh $2,$19,$4 # $4 = cy_limb 64 ldq $2,16($17) # $2 = s1_limb 65 addq $17,8,$17 # s1_ptr++ 66 addq $3,$0,$3 # $3 = cy_limb + prod_low 67 stq $3,8($16) 68 cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) 69 addq $16,8,$16 # res_ptr++ 70 bne $18,Loop 71 72Lend2: mulq $2,$19,$3 # $3 = prod_low 73 addq $4,$0,$0 # cy_limb = cy_limb + 'cy' 74 umulh $2,$19,$4 # $4 = cy_limb 75 addq $3,$0,$3 # $3 = cy_limb + prod_low 76 cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) 77 stq $3,8($16) 78 addq $4,$0,$0 # cy_limb = prod_high + cy 79 ret $31,($26),1 80Lend1: stq $3,0($16) 81 ret $31,($26),1 82 83 .end __mpn_mul_1 84