/* Pentium optimized __mpn_rshift --
   Copyright (C) 1992-2022 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not,
   see <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS	4+16		/* space for 4 saved regs */
#define RES	PARMS
#define S	RES+4
#define SIZE	S+4
#define CNT	SIZE+4

	.text
ENTRY (__mpn_rshift)

	pushl	%edi
	cfi_adjust_cfa_offset (4)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	pushl	%ebp
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebp, 0)
	pushl	%ebx
	cfi_adjust_cfa_offset (4)

	movl	RES(%esp),%edi
	cfi_rel_offset (edi, 12)
	movl	S(%esp),%esi
	cfi_rel_offset (esi, 8)
	movl	SIZE(%esp),%ebx
	cfi_rel_offset (ebx, 0)
	movl	CNT(%esp),%ecx

/* We can use faster code for shift-by-1 under certain conditions.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		/* jump if res_ptr + 1 >= s_ptr */
	leal	(%edi,%ebx,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		/* jump if s_ptr >= res_ptr + size */

L(normal):
	movl	(%esi),%edx
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl	%cl,%edx,%eax		/* compute carry limb */
	pushl	%eax			/* push carry limb onto stack */
	cfi_adjust_cfa_offset (4)

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx
	jz	L(end)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	shrdl	%cl,%ebp,%edx
	shrdl	%cl,%eax,%ebp
	movl	%edx,8(%edi)
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	shrdl	%cl,%edx,%eax
	shrdl	%cl,%ebp,%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebx
	jnz	L(oop)

L(end):	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	andl	$7,%ebx
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl	%cl,%eax,%edx		/* compute result limb */
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		/* compute most significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */
	cfi_adjust_cfa_offset (-4)

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
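
/* For reference, the operation implemented by this function, as a minimal C
   sketch assuming 32-bit limbs and 1 <= cnt <= 31.  The names below are
   illustrative only and are not taken from the library headers:

   unsigned int
   mpn_rshift_ref (unsigned int *res_ptr, const unsigned int *s_ptr,
                   long size, unsigned int cnt)
   {
     unsigned int retval = s_ptr[0] << (32 - cnt);   // bits shifted out
     long i;
     for (i = 0; i < size - 1; i++)
       res_ptr[i] = (s_ptr[i] >> cnt) | (s_ptr[i + 1] << (32 - cnt));
     res_ptr[size - 1] = s_ptr[size - 1] >> cnt;     // most significant limb
     return retval;
   }
*/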
/* This code loops from the most significant end of the arrays, which is only
   permissible for the operand overlaps allowed by the checks above, since the
   function is documented to work for overlapping source and destination.  */

	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (edi, 12)
	cfi_rel_offset (esi, 8)
	cfi_rel_offset (ebp, 4)
	cfi_rel_offset (ebx, 0)
L(special):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx

	shrl	$1,%edx			/* shift most significant limb, CF = bit out */
	incl	%ebx
	decl	%ebx			/* test %ebx for zero without clobbering CF */
	jz	L(Lend)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(Loop):
	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,(%edi)
	rcrl	$1,%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	rcrl	$1,%ebp
	movl	%edx,-8(%edi)
	rcrl	$1,%eax
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	rcrl	$1,%edx
	movl	%eax,-16(%edi)
	rcrl	$1,%ebp
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,-24(%edi)
	rcrl	$1,%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		/* use leal not to clobber carry */
	leal	-32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	sbbl	%eax,%eax		/* save carry in %eax */
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	rcrl	$1,%edx
	movl	%ebp,(%edi)

	leal	-4(%esi),%esi		/* use leal not to clobber carry */
	leal	-4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

	movl	$0,%eax
	rcrl	$1,%eax			/* return the out-shifted bit in bit 31 */

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
END (__mpn_rshift)
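
/* The L(special) path above relies on a chain of rotate-through-carry
   operations: the whole array is shifted right by one bit from the most
   significant limb downward, with the bit shifted out of each limb passed to
   the limb below through the carry flag.  A minimal C sketch of that idea,
   assuming 32-bit limbs; the names are illustrative only:

   unsigned int
   rshift1_by_carry_chain (unsigned int *res, const unsigned int *src,
                           long size)
   {
     unsigned int carry = 0;                  // bit arriving from the limb above
     long i;
     for (i = size - 1; i >= 0; i--)          // most significant limb first
       {
         unsigned int out = src[i] & 1;       // bit falling off this limb
         res[i] = (src[i] >> 1) | (carry << 31);
         carry = out;
       }
     return carry << 31;                      // matches the function's return value
   }
*/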