/* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
   the result in a second limb vector.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not,
   see <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "asm-syntax.h"

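/* Incoming arguments follow the System V AMD64 calling convention:
   the first four integer parameters arrive in %rdi, %rsi, %rdx and
   %rcx, which the macros below name after their roles.  */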
#define rp	%rdi
#define up	%rsi
#define n_param	%rdx
#define vl	%rcx

#define n	%r11

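/* Functional sketch of what this routine computes, for orientation
   only; not part of the build.  It assumes 64-bit limbs and a
   compiler providing unsigned __int128 (e.g. GCC on x86-64):

     mp_limb_t
     mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
     {
       mp_limb_t carry = 0;
       for (mp_size_t i = 0; i < n; i++)
	 {
	   unsigned __int128 p = (unsigned __int128) up[i] * vl + carry;
	   rp[i] = (mp_limb_t) p;
	   carry = (mp_limb_t) (p >> 64);
	 }
       return carry;
     }

   Each rp[i] receives the low limb of up[i] * vl plus the incoming
   carry; the final carry, the most significant limb, is returned.  */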
	.text
ENTRY (__mpn_mul_1)
	push	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	xor	%r10, %r10
	mov	(up), %rax		/* read first u limb early */
	mov	n_param, %rbx		/* move away n from rdx, mul uses it */
	mul	vl
	mov	%rbx, %r11

	add	%r10, %rax
	adc	$0, %rdx

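/* Dispatch on n mod 4 so that any operand size can enter the 4-way
   unrolled loop below; each entry block seeds the product and carry
   registers the loop body expects at that point.  */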
	and	$3, %ebx
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

L(b1):	dec	n
	jne	L(gt1)
	mov	%rax, (rp)
	jmp	L(ret)
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	%ebx, %ebx
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	%ebx, %ebx
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

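/* Main loop: four limbs per iteration, software pipelined so the
   store of one product limb overlaps the multiply of the next.  The
   negated index n counts up toward zero.  */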
	.p2align 4
L(top): mov	%r10, (rp,n,8)
	add	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, %r10d
L(L1):	mul	vl
	mov	%r9, 8(rp,n,8)
	add	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	mov	%r8, 16(rp,n,8)
	add	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	mov	%rbx, 24(rp,n,8)
	mov	$0, %r8d                # zero
	mov	%r8, %rbx               # zero
	add	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9                # zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

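/* Wind down: store the last two result limbs and return the final
   carry, the most significant limb of the product, in %rax.  */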
	mov	%r10, (rp,n,8)
	add	%rax, %r9
	mov	%r9, 8(rp,n,8)
	adc	%r8, %rdx		/* %r8 is zero here */
L(ret):	mov	%rdx, %rax

	pop	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	ret
END (__mpn_mul_1)
