;! HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
;! subtract the result from a second limb vector.

;! Copyright (C) 1992-2021 Free Software Foundation, Inc.

;! This file is part of the GNU MP Library.

;! The GNU MP Library is free software; you can redistribute it and/or modify
;! it under the terms of the GNU Lesser General Public License as published by
;! the Free Software Foundation; either version 2.1 of the License, or (at your
;! option) any later version.

;! The GNU MP Library is distributed in the hope that it will be useful, but
;! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;! License for more details.

;! You should have received a copy of the GNU Lesser General Public License
;! along with the GNU MP Library.  If not, see
;! <https://www.gnu.org/licenses/>.


;! INPUT PARAMETERS
;! res_ptr	r26
;! s1_ptr	r25
;! size		r24
;! s2_limb	r23

;! OPERATION
;! Limbs are 32-bit words (both pointers advance by 4 per limb).  For each
;! limb, the low word of s1_ptr[i] * s2_limb plus the carry limb from the
;! previous position is subtracted from res_ptr[i] and stored back.  When
;! the routine returns, r28 holds the final carry limb (high word of the
;! last product plus the pending carry and borrow).
;! size is decremented before it is first tested, so size == 0 is not
;! handled specially; the code expects size >= 1.

;! REGISTER AND STACK USE
;! r20		limb about to be subtracted from the word at res_ptr
;! r1		low word of the product most recently stored to the scratch slot
;! r28		high word of a product / carry limb
;! r29		word loaded from res_ptr
;! r22		difference written back through res_ptr
;! fr4		s2_limb
;! fr5, fr7	limbs loaded from s1_ptr
;! fr6, fr8	64-bit products
;! -16(%r30)	8-byte scratch slot used to move data between the floating
;!		point and integer registers; after a product is stored its
;!		high word is at -16(%r30) and its low word at -12(%r30)

;! This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
;! can not become faster due to data cache contention after a store.  On the
;! PA7100 it runs at 11 cycles/limb, and that can not be improved either,
;! since only the xmpyu does not need the integer pipeline, so the only
;! dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
;! on the PA7100.

;! There are some ideas described in mul_1.s that apply to this code too.

;! It seems possible to make this run as fast as __mpn_addmul_1, if we use
;!	sub,>>=	%r29,%r20,%r22
;!	addi	1,%r28,%r28
;! but that requires reworking the hairy software pipeline...

	.text
	.export		__mpn_submul_1
__mpn_submul_1:
	.proc
	.callinfo	frame=64,no_calls
	.entry

;! Prologue: allocate the 64-byte frame, fetch the first s1 limb and move
;! s2_limb into fr4 by way of the scratch slot.
	ldo		64(%r30),%r30		;! r30 += 64 (frame declared in .callinfo)
	fldws,ma	4(%r25),%fr5		;! fr5 = first s1 limb; s1_ptr += 4
	stw		%r23,-16(%r30)		;! move s2_limb ...
	addib,=		-1,%r24,L$just_one_limb	;! size -= 1; branch if that was the only limb
	 fldws		-16(%r30),%fr4		;! ... into fr4 (delay slot: runs on both paths)
	add		%r0,%r0,%r0		;! clear carry

;! Pipeline start-up: form the first two products before entering the loop.
	xmpyu		%fr4,%fr5,%fr6		;! fr6 = product of limb 0
	fldws,ma	4(%r25),%fr7		;! fr7 = second s1 limb; s1_ptr += 4
	fstds		%fr6,-16(%r30)		;! spill product 0 to the scratch slot
	xmpyu		%fr4,%fr7,%fr8		;! fr8 = product of limb 1
	ldw		-12(%r30),%r20		;! least significant limb in product
	ldw		-16(%r30),%r28		;! r28 = high word of product 0

	fstds		%fr8,-16(%r30)		;! spill product 1 to the scratch slot
	addib,=		-1,%r24,L$end		;! size -= 1; exactly two limbs -> skip the loop
	 ldw		-12(%r30),%r1		;! delay slot: r1 = low word of product 1

;! Main loop
;! Each pass stores one result limb, builds the subtrahend for the next limb
;! and starts the multiply for the limb after that.
L$loop:
	ldws		0(%r26),%r29		;! r29 = current word at res_ptr
	fldws,ma	4(%r25),%fr5		;! fr5 = next s1 limb; s1_ptr += 4
	sub		%r29,%r20,%r22		;! r22 = r29 - r20
	add		%r22,%r20,%r0		;! sum discarded (r0); carry is set iff the sub borrowed
	stws,ma		%r22,4(%r26)		;! store the difference; res_ptr += 4
	addc		%r28,%r1,%r20		;! next subtrahend = high word + next low word + borrow
	xmpyu		%fr4,%fr5,%fr6		;! fr6 = product of the limb just loaded
	ldw		-16(%r30),%r28		;! r28 = high word of the product still in the slot
	fstds		%fr6,-16(%r30)		;! overwrite the slot with the new product
	addc		%r0,%r28,%r28		;! add the carry out of the addc above into r28
	addib,<>	-1,%r24,L$loop		;! size -= 1; loop while limbs remain
	 ldw		-12(%r30),%r1		;! delay slot: r1 = low word of the new product

;! Pipeline wind-down: finish the last two limbs, whose products have
;! already been computed.
L$end:
	ldw		0(%r26),%r29		;! r29 = current word at res_ptr
	sub		%r29,%r20,%r22		;! r22 = r29 - r20
	add		%r22,%r20,%r0		;! carry is set iff the sub borrowed
	stws,ma		%r22,4(%r26)		;! store the difference; res_ptr += 4
	addc		%r28,%r1,%r20		;! subtrahend for the final limb
	ldw		-16(%r30),%r28		;! r28 = high word of the final product
	ldws		0(%r26),%r29		;! r29 = final word at res_ptr
	addc		%r0,%r28,%r28		;! add the carry out of the addc above into r28
	sub		%r29,%r20,%r22		;! r22 = r29 - r20
	add		%r22,%r20,%r0		;! carry is set iff the sub borrowed
	stws,ma		%r22,4(%r26)		;! store the final difference; res_ptr += 4
	addc		%r0,%r28,%r28		;! r28 += final borrow; r28 is the value left for the caller
	bv		0(%r2)			;! return through r2
	 ldo		-64(%r30),%r30		;! delay slot: release the 64-byte frame

;! size == 1: a single multiply and subtract, no pipelining.
;! fr4 was already loaded in the delay slot of the branch that came here.
L$just_one_limb:
	xmpyu		%fr4,%fr5,%fr6		;! fr6 = product of the only limb
	ldw		0(%r26),%r29		;! r29 = word at res_ptr
	fstds		%fr6,-16(%r30)		;! spill the product to the scratch slot
	ldw		-12(%r30),%r1		;! r1 = low word of the product
	ldw		-16(%r30),%r28		;! r28 = high word of the product
	sub		%r29,%r1,%r22		;! r22 = r29 - r1
	add		%r22,%r1,%r0		;! carry is set iff the sub borrowed
	stw		%r22,0(%r26)		;! store the difference (res_ptr is not advanced)
	addc		%r0,%r28,%r28		;! r28 = high word + borrow; value left for the caller
	bv		0(%r2)			;! return through r2
	 ldo		-64(%r30),%r30		;! delay slot: release the 64-byte frame

	.exit
	.procend