;! HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
;! add the result to a second limb vector.

;! Copyright (C) 1992-2021 Free Software Foundation, Inc.

;! This file is part of the GNU MP Library.

;! The GNU MP Library is free software; you can redistribute it and/or modify
;! it under the terms of the GNU Lesser General Public License as published by
;! the Free Software Foundation; either version 2.1 of the License, or (at your
;! option) any later version.

;! The GNU MP Library is distributed in the hope that it will be useful, but
;! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;! License for more details.

;! You should have received a copy of the GNU Lesser General Public License
;! along with the GNU MP Library.  If not, see
;! <https://www.gnu.org/licenses/>.


;! INPUT PARAMETERS
;! res_ptr      r26
;! s1_ptr       r25
;! size         r24
;! s2_limb      r23

;! RESULT
;! Each of the `size' limbs at res_ptr is replaced by itself plus the
;! corresponding limb of s1_ptr[] * s2_limb (with carries propagated), and
;! the final carry limb is left in r28 when the routine returns via
;! bv 0(%r2).
;! NOTE(review): r28 is presumably the caller-visible return value (ret0 in
;! the PA-RISC calling convention) -- confirm against the C prototype.

;! NOTE: size == 0 is not handled.  The two `addib,=' tests below only detect
;! the count reaching exactly zero, so the caller must pass size >= 1.

;! REGISTER / STACK USAGE
;! fr4          s2_limb
;! fr5, fr7     limbs fetched from s1_ptr
;! fr6, fr8     64-bit products from xmpyu
;! r20          limb waiting to be added into *res_ptr
;! r28          most significant limb of a product, later the carry limb
;! r1           least significant limb of the newest product
;! r29          limb loaded from res_ptr
;! -16(%r30)    8-byte scratch slot in the 64-byte frame.  Values are moved
;!              between the general and floating-point registers through
;!              it; a stored product has its most significant word at
;!              -16(%r30) and its least significant word at -12(%r30).

;! Instructions indented by one extra space sit in the delay slot of the
;! branch on the preceding line.

;! This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
;! cannot become faster due to data cache contention after a store.  On the
;! PA7100 it runs at 10 cycles/limb, and that cannot be improved either,
;! since only the xmpyu does not need the integer pipeline, so the only
;! dual-issue pairs we will get are addc+xmpyu.  Unrolling could gain a
;! cycle/limb on the PA7100.

;! There are some ideas described in mul_1.s that apply to this code too.

        .text
        .export         __mpn_addmul_1
__mpn_addmul_1:
        .proc
        .callinfo       frame=64,no_calls
        .entry

        ldo             64(%r30),%r30           ;! allocate the 64-byte frame
        fldws,ma        4(%r25),%fr5            ;! fr5 = first s1 limb, s1_ptr += 4
        stw             %r23,-16(%r30)          ;! move s2_limb ...
        addib,=         -1,%r24,L$just_one_limb ;! size -= 1; branch if it is now zero
         fldws          -16(%r30),%fr4          ;! ... into fr4
        add             %r0,%r0,%r0             ;! clear carry
        xmpyu           %fr4,%fr5,%fr6          ;! fr6 = s2_limb * first s1 limb
        fldws,ma        4(%r25),%fr7            ;! fr7 = second s1 limb, s1_ptr += 4
        fstds           %fr6,-16(%r30)          ;! spill first product to the scratch slot
        xmpyu           %fr4,%fr7,%fr8          ;! fr8 = s2_limb * second s1 limb
        ldw             -12(%r30),%r20          ;! least significant limb in product
        ldw             -16(%r30),%r28          ;! most significant limb in product

        fstds           %fr8,-16(%r30)          ;! spill second product to the scratch slot
        addib,=         -1,%r24,L$end           ;! size -= 1; skip the loop if it is now zero
         ldw            -12(%r30),%r1           ;! r1 = low limb of second product

;! Main loop
;! At the top of each iteration:
;!   r20 = limb still to be added into *res_ptr
;!   r28 = high limb of the previous product
;!   r1  = low limb of the newest product, whose high limb is still in the
;!         scratch slot at -16(%r30)
;! The carry produced by each add/addc is consumed by the following addc;
;! the loads, stores and xmpyu in between are expected to leave it intact.
L$loop:
        ldws            0(%r26),%r29            ;! r29 = *res_ptr
        fldws,ma        4(%r25),%fr5            ;! fr5 = next s1 limb, s1_ptr += 4
        add             %r29,%r20,%r20          ;! add pending limb into it; sets carry
        stws,ma         %r20,4(%r26)            ;! *res_ptr = sum, res_ptr += 4
        addc            %r28,%r1,%r20           ;! next pending limb = high + low + carry
        xmpyu           %fr4,%fr5,%fr6          ;! fr6 = s2_limb * next s1 limb
        ldw             -16(%r30),%r28          ;! r28 = high limb of newest product; read
                                                ;! before the fstds below overwrites it
        fstds           %fr6,-16(%r30)          ;! spill the next product to the scratch slot
        addc            %r0,%r28,%r28           ;! fold the carry into the high limb
        addib,<>        -1,%r24,L$loop          ;! size -= 1; loop while it is non-zero
         ldw            -12(%r30),%r1           ;! r1 = low limb of the next product

;! Wind-down: two result limbs are still outstanding.  r20, r28 and r1 hold
;! the same values as at the top of the loop, and the high limb of the last
;! product is still in the scratch slot.
L$end:
        ldw             0(%r26),%r29            ;! r29 = *res_ptr
        add             %r29,%r20,%r20          ;! add pending limb into it; sets carry
        stws,ma         %r20,4(%r26)            ;! *res_ptr = sum, res_ptr += 4
        addc            %r28,%r1,%r20           ;! last pending limb = high + low + carry
        ldw             -16(%r30),%r28          ;! r28 = high limb of the last product
        ldws            0(%r26),%r29            ;! r29 = last limb at res_ptr
        addc            %r0,%r28,%r28           ;! fold in the carry from the addc above
        add             %r29,%r20,%r20          ;! add last pending limb; sets carry
        stws,ma         %r20,4(%r26)            ;! store the last result limb
        addc            %r0,%r28,%r28           ;! r28 = final carry limb
        bv              0(%r2)                  ;! return to the address in r2
         ldo            -64(%r30),%r30          ;! release the 64-byte frame

;! size == 1: one multiply and one add.  fr4 was loaded in the delay slot of
;! the branch that brought us here, and fr5 holds the single s1 limb.
L$just_one_limb:
        xmpyu           %fr4,%fr5,%fr6          ;! fr6 = s2_limb * s1 limb
        ldw             0(%r26),%r29            ;! r29 = *res_ptr
        fstds           %fr6,-16(%r30)          ;! spill the product to the scratch slot
        ldw             -12(%r30),%r1           ;! r1 = low limb of the product
        ldw             -16(%r30),%r28          ;! r28 = high limb of the product
        add             %r29,%r1,%r20           ;! *res_ptr + low limb; sets carry
        stw             %r20,0(%r26)            ;! store the result limb
        addc            %r0,%r28,%r28           ;! r28 = high limb + carry
        bv              0(%r2)                  ;! return to the address in r2
         ldo            -64(%r30),%r30          ;! release the 64-byte frame

        .exit
        .procend
