# Alpha 21064 __mpn_lshift --

# Copyright (C) 1994-2022 Free Software Foundation, Inc.

# This file is part of the GNU MP Library.

# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at your
# option) any later version.

# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.


# INPUT PARAMETERS
# res_ptr	r16
# s1_ptr	r17
# size		r18
# cnt		r19

# This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
# it would take 4 cycles/limb. It should be possible to get down to 3
# cycles/limb since both ldq and stq can be paired with the other used
# instructions. But there are many restrictions in the 21064 pipeline that
# make it hard, if not impossible, to get down to 3 cycles/limb:

# 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
# 2. Only aligned instruction pairs can be paired.
# 3. The store buffer or silo might not be able to deal with the bandwidth.
# __mpn_lshift -- shift a vector of 64-bit limbs left by a bit count.
#
# Register use on entry:
#   $16  res_ptr  base of the destination limb vector
#   $17  s1_ptr   base of the source limb vector
#   $18  size     number of limbs; the top limb is loaded unconditionally,
#                 so size is presumably >= 1 (confirm against callers)
#   $19  cnt      left-shift distance in bits; presumably 1..63 (confirm
#                 against callers)
#   $26  return address
#
# Result:
#   $0   top source limb shifted right by -cnt.  The Alpha shifter looks
#        only at the low six bits of the count, so -cnt behaves as 64-cnt
#        and $0 holds the bits pushed out of the most significant limb.
#
# Also written: $1-$8, $20, and the argument registers $16-$18.
#
# Both vectors are walked from the most significant limb downwards.  Each
# result limb is (upper << cnt) | (lower >> -cnt).  The first
# (size-1) mod 4 results are produced one at a time, the remainder four
# per iteration, and the least significant result limb is stored last.
# The instruction order below is kept exactly as originally scheduled.

	.text
	.set	noreorder
	.set	noat
	.align	3
	.globl	__mpn_lshift
	.ent	__mpn_lshift
__mpn_lshift:
	.frame	$30,0,$26,0

	s8addq	$18,$17,$17		# $17 = s1_ptr + 8*size (one past the top limb)
	ldq	$4,-8($17)		# $4 = most significant source limb
	subq	$17,8,$17		# $17 -> the limb just loaded
	negq	$19,$7			# $7 = -cnt, used as the right-shift count
	s8addq	$18,$16,$16		# $16 = res_ptr + 8*size
	subq	$18,1,$18		# $18 = size - 1 result limbs need two inputs
	and	$18,3,$20		# $20 = (size - 1) mod 4, done one at a time
	srl	$4,$7,$0		# function result: bits leaving the top limb

	beq	$20,.Lpeel_done		# nothing to peel off
	subq	$18,$20,$18		# $18 = multiple of 4 left for the unrolled loop

	.align	3
.Lpeel:
	ldq	$3,-8($17)		# $3 = next lower source limb
	subq	$16,8,$16
	subq	$17,8,$17
	subq	$20,1,$20
	sll	$4,$19,$5		# contribution of the upper limb
	srl	$3,$7,$6		# contribution of the lower limb
	mov	$3,$4			# lower limb is the upper limb next time round
	or	$5,$6,$8
	stq	$8,0($16)
	bne	$20,.Lpeel

.Lpeel_done:
	beq	$18,.Llowest		# no groups of four remain

	# Four result limbs per pass; $3 and $4 alternate as upper/lower limb.
	.align	3
.Lunrolled:
	ldq	$3,-8($17)
	subq	$16,32,$16
	subq	$18,4,$18
	sll	$4,$19,$5
	srl	$3,$7,$6

	ldq	$4,-16($17)
	sll	$3,$19,$1
	or	$5,$6,$8
	stq	$8,24($16)
	srl	$4,$7,$2

	ldq	$3,-24($17)
	sll	$4,$19,$5
	or	$1,$2,$8
	stq	$8,16($16)
	srl	$3,$7,$6

	ldq	$4,-32($17)
	sll	$3,$19,$1
	or	$5,$6,$8
	stq	$8,8($16)
	srl	$4,$7,$2

	subq	$17,32,$17
	or	$1,$2,$8
	stq	$8,0($16)

	bgt	$18,.Lunrolled

.Llowest:
	sll	$4,$19,$8		# least significant limb takes in no lower bits
	stq	$8,-8($16)
	ret	$31,($26),1
	.end	__mpn_lshift