/* Pentium optimized __mpn_rshift --
   Copyright (C) 1992-2022 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not,
   see <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS	4+16		/* space for 4 saved regs */
#define RES	PARMS
#define S	RES+4
#define SIZE	S+4
#define CNT	SIZE+4
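
/* The stack offsets above correspond to the C prototype
     mp_limb_t __mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr,
			     mp_size_t size, unsigned int cnt);
   shifting {s_ptr, size} right by cnt bits (0 < cnt < 32), storing the
   result at res_ptr and returning the bits shifted out, placed in the
   most significant bits of the return value.  */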

	.text
ENTRY (__mpn_rshift)

	pushl	%edi
	cfi_adjust_cfa_offset (4)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	pushl	%ebp
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebp, 0)
	pushl	%ebx
	cfi_adjust_cfa_offset (4)

	movl	RES(%esp),%edi
	cfi_rel_offset (edi, 12)
	movl	S(%esp),%esi
	cfi_rel_offset (esi, 8)
	movl	SIZE(%esp),%ebx
	cfi_rel_offset (ebx, 0)
	movl	CNT(%esp),%ecx

/* We can use faster code for shift-by-1 under certain conditions.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		/* jump if res_ptr + 1 >= s_ptr */
	leal	(%edi,%ebx,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		/* jump if s_ptr >= res_ptr + size */
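
/* The shift-by-1 code at L(special) walks from the most significant limb
   downwards; the two checks above branch to it only when that direction
   cannot overwrite source limbs that have not been read yet.  */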

L(normal):
	movl	(%esi),%edx
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl	%cl,%edx,%eax		/* compute carry limb */
	pushl	%eax			/* push carry limb onto stack */
	cfi_adjust_cfa_offset (4)
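
/* The carry limb just pushed holds the bits shifted out of the least
   significant source limb; it is popped back into %eax at the end and
   becomes the return value.  */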

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx
	jz	L(end)
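
/* Main loop: eight limbs per iteration, using shrdl to combine each limb
   with the one above it.  %edx carries the most recently fetched source
   limb from one iteration into the next.  */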

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	shrdl	%cl,%ebp,%edx
	shrdl	%cl,%eax,%ebp
	movl	%edx,8(%edi)
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	shrdl	%cl,%edx,%eax
	shrdl	%cl,%ebp,%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebx
	jnz	L(oop)

L(end):	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	andl	$7,%ebx
	jz	L(end2)
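
/* Process the remaining (size-1) % 8 limbs one at a time.  */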
L(oop2):
	movl	(%esi),%eax
	shrdl	%cl,%eax,%edx		/* compute result limb */
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		/* compute most significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */
	cfi_adjust_cfa_offset (-4)

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret

/* We loop from the most significant end of the arrays here, which is only
   permissible for the kinds of overlap accepted by the checks made before
   branching to L(special), since the function is documented to work for
   overlapping source and destination.  */

	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (edi, 12)
	cfi_rel_offset (esi, 8)
	cfi_rel_offset (ebp, 4)
	cfi_rel_offset (ebx, 0)
L(special):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi
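
/* Shift-by-1 path: %edi and %esi now point at the most significant limbs;
   the loop walks downwards, propagating each shifted-out bit through the
   carry flag with rcrl.  */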

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx

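/* The shrl below produces the most significant result limb and leaves the
   bit shifted out of it in the carry flag; incl/decl then re-test %ebx for
   zero without disturbing that carry.  */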
	shrl	$1,%edx
	incl	%ebx
	decl	%ebx
	jz	L(Lend)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(Loop):
	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,(%edi)
	rcrl	$1,%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	rcrl	$1,%ebp
	movl	%edx,-8(%edi)
	rcrl	$1,%eax
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	rcrl	$1,%edx
	movl	%eax,-16(%edi)
	rcrl	$1,%ebp
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,-24(%edi)
	rcrl	$1,%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		/* use leal not to clobber carry */
	leal	-32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	sbbl	%eax,%eax		/* save carry in %eax */
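
/* sbbl of a register with itself yields 0 or -1 according to the carry
   flag; the andl below clears that flag, so the shifted-out bit is parked
   in %eax and recreated later with addl %eax,%eax.  */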
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	rcrl	$1,%edx
	movl	%ebp,(%edi)

	leal	-4(%esi),%esi		/* use leal not to clobber carry */
	leal	-4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

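/* Rotate the carry into bit 31 of a zeroed %eax: the return value is the
   single bit shifted out of the least significant limb, placed in the most
   significant bit position.  */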
	movl	$0,%eax
	rcrl	$1,%eax

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
END (__mpn_rshift)