1 /* Wrapper implementations of vector math functions.
2    Copyright (C) 2014-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
/* SSE2 ISA version as wrapper to scalar.

   Input:   %xmm0 = four packed floats.
   Output:  %xmm0 = the four callee results, in lane order.

   The callee is invoked once per lane with that lane in the low
   element of %xmm0.  Stack frame (40 bytes; together with the return
   address this keeps %rsp 16-byte aligned at every call, assuming the
   usual ABI alignment on entry):
      0..15(%rsp)   copy of the input vector
     16..27(%rsp)   results for lanes 0, 1 and 2
   The result for lane 3 is taken directly from %xmm0 after the last
   call, so it is never written to the stack.  */
.macro WRAPPER_IMPL_SSE2 callee
        subq      $40, %rsp
        cfi_adjust_cfa_offset(40)
        movaps    %xmm0, (%rsp)
        /* Lane 0 is already the low element of %xmm0.  */
        call      JUMPTARGET(\callee)
        movss     %xmm0, 16(%rsp)
        /* Lane 1.  */
        movss     4(%rsp), %xmm0
        call      JUMPTARGET(\callee)
        movss     %xmm0, 20(%rsp)
        /* Lane 2.  */
        movss     8(%rsp), %xmm0
        call      JUMPTARGET(\callee)
        movss     %xmm0, 24(%rsp)
        /* Lane 3; its result stays in %xmm0.  */
        movss     12(%rsp), %xmm0
        call      JUMPTARGET(\callee)
        /* Reload r0, r1, r2 and interleave them with r3 (%xmm0).  */
        movss     16(%rsp), %xmm3
        movss     20(%rsp), %xmm2
        movss     24(%rsp), %xmm1
        /* %xmm3 = { r0, r2, ... }.  */
        unpcklps  %xmm1, %xmm3
        /* %xmm2 = { r1, r3, ... }.  */
        unpcklps  %xmm0, %xmm2
        /* %xmm3 = { r0, r1, r2, r3 }.  */
        unpcklps  %xmm2, %xmm3
        movaps    %xmm3, %xmm0
        addq      $40, %rsp
        cfi_adjust_cfa_offset(-40)
        ret
.endm
46 
/* 2 argument SSE2 ISA version as wrapper to scalar.

   Input:   %xmm0, %xmm1 = two vectors of four packed floats.
   Output:  %xmm0 = the four callee results, in lane order.

   The callee is invoked once per lane with the matching elements of
   both inputs in the low elements of %xmm0 and %xmm1.  Stack frame
   (56 bytes; together with the return address this keeps %rsp 16-byte
   aligned at every call, assuming the usual ABI alignment on entry):
      0..15(%rsp)   copy of the first input vector
     16..31(%rsp)   copy of the second input vector
     32..43(%rsp)   results for lanes 0, 1 and 2
   The result for lane 3 is taken directly from %xmm0 after the last
   call, so it is never written to the stack.  */
.macro WRAPPER_IMPL_SSE2_ff callee
        subq      $56, %rsp
        cfi_adjust_cfa_offset(56)
        movaps    %xmm0, (%rsp)
        movaps    %xmm1, 16(%rsp)
        /* Lane 0 of both inputs is already in place.  */
        call      JUMPTARGET(\callee)
        movss     %xmm0, 32(%rsp)
        /* Lane 1.  */
        movss     4(%rsp), %xmm0
        movss     20(%rsp), %xmm1
        call      JUMPTARGET(\callee)
        movss     %xmm0, 36(%rsp)
        /* Lane 2.  */
        movss     8(%rsp), %xmm0
        movss     24(%rsp), %xmm1
        call      JUMPTARGET(\callee)
        movss     %xmm0, 40(%rsp)
        /* Lane 3; its result stays in %xmm0.  */
        movss     12(%rsp), %xmm0
        movss     28(%rsp), %xmm1
        call      JUMPTARGET(\callee)
        /* Reload r0, r1, r2 and interleave them with r3 (%xmm0).  */
        movss     32(%rsp), %xmm3
        movss     36(%rsp), %xmm2
        movss     40(%rsp), %xmm1
        /* %xmm3 = { r0, r2, ... }.  */
        unpcklps  %xmm1, %xmm3
        /* %xmm2 = { r1, r3, ... }.  */
        unpcklps  %xmm0, %xmm2
        /* %xmm3 = { r0, r1, r2, r3 }.  */
        unpcklps  %xmm2, %xmm3
        movaps    %xmm3, %xmm0
        addq      $56, %rsp
        cfi_adjust_cfa_offset(-56)
        ret
.endm
78 
/* 3 argument SSE2 ISA version as wrapper to scalar.

   Input:  %xmm0 = four packed floats, %rdi and %rsi = two result
           arrays of four floats each.

   For every lane the callee is called with the lane value in the low
   element of %xmm0 and with %rdi / %rsi pointing at two scratch floats
   on the stack.  Afterwards the scratch values are copied to element i
   of the arrays originally passed in %rdi and %rsi respectively.

   Stack frame below the two saved registers (40 bytes):
      0..15(%rsp)   copy of the input vector
     24(%rsp)       scratch float addressed through %rsi
     28(%rsp)       scratch float addressed through %rdi
   %rbp and %rbx keep the caller's %rdi and %rsi across the calls.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
        pushq   %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        pushq   %rbx
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbx, 0)
        subq    $40, %rsp
        cfi_adjust_cfa_offset(40)
        movq    %rdi, %rbp
        movq    %rsi, %rbx
        movaps  %xmm0, (%rsp)

        /* Lane 0: the value is already the low element of %xmm0.  */
        leaq    28(%rsp), %rdi
        leaq    24(%rsp), %rsi
        call    JUMPTARGET(\callee)

        /* Copy out the lane 0 results, then broadcast lane 1.  */
        movss   28(%rsp), %xmm0
        movss   %xmm0, (%rbp)
        movss   24(%rsp), %xmm0
        movss   %xmm0, (%rbx)
        movaps  (%rsp), %xmm1
        movaps  %xmm1, %xmm0
        shufps  $0x55, %xmm1, %xmm0
        leaq    28(%rsp), %rdi
        leaq    24(%rsp), %rsi
        call    JUMPTARGET(\callee)

        /* Copy out the lane 1 results, then move lane 2 to the low
           element (unpckhps yields { x2, x2, x3, x3 }).  */
        movss   28(%rsp), %xmm0
        movss   %xmm0, 4(%rbp)
        movss   24(%rsp), %xmm0
        movss   %xmm0, 4(%rbx)
        movaps  (%rsp), %xmm1
        movaps  %xmm1, %xmm0
        unpckhps        %xmm1, %xmm0
        leaq    28(%rsp), %rdi
        leaq    24(%rsp), %rsi
        call    JUMPTARGET(\callee)

        /* Copy out the lane 2 results, then broadcast lane 3.  */
        movss   28(%rsp), %xmm0
        movss   %xmm0, 8(%rbp)
        movss   24(%rsp), %xmm0
        movss   %xmm0, 8(%rbx)
        movaps  (%rsp), %xmm1
        shufps  $0xff, %xmm1, %xmm1
        movaps  %xmm1, %xmm0
        leaq    28(%rsp), %rdi
        leaq    24(%rsp), %rsi
        call    JUMPTARGET(\callee)

        /* Copy out the lane 3 results.  */
        movss   28(%rsp), %xmm0
        movss   %xmm0, 12(%rbp)
        movss   24(%rsp), %xmm0
        movss   %xmm0, 12(%rbx)

        addq    $40, %rsp
        cfi_adjust_cfa_offset(-40)
        popq    %rbx
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbx)
        popq    %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
139 
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Input:   %ymm0 = eight packed floats.
   Output:  %ymm0 = callee result for the low half in bits 0..127 and
            for the high half in bits 128..255.

   The callee is called twice with a 128-bit half in %xmm0.  %rbp is
   used as frame pointer because %rsp is realigned to 32 bytes.
   Scratch area:
      0..15(%rsp)   high half of the input
     16..31(%rsp)   result for the low half  */
.macro WRAPPER_IMPL_AVX callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-32, %rsp
        subq      $32, %rsp
        /* Spill the high half; the low half is already in %xmm0.  */
        vextractf128 $1, %ymm0, (%rsp)
        /* Clear the upper YMM halves before entering the SSE callee.  */
        vzeroupper
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, 16(%rsp)
        /* Second call on the spilled high half.  */
        vmovaps   (%rsp), %xmm0
        call      HIDDEN_JUMPTARGET(\callee)
        /* %ymm0 = { low result, high result }.  */
        vmovaps   %xmm0, %xmm1
        vmovaps   16(%rsp), %xmm0
        vinsertf128 $1, %xmm1, %ymm0, %ymm0
        /* leave = movq %rbp, %rsp followed by popq %rbp.  */
        leave
        cfi_def_cfa_register (%rsp)
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
165 
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Input:   %ymm0, %ymm1 = two vectors of eight packed floats.
   Output:  %ymm0 = callee result for the low halves in bits 0..127
            and for the high halves in bits 128..255.

   The callee is called twice with matching 128-bit halves in %xmm0
   and %xmm1.  %rbp is used as frame pointer because %rsp is realigned
   to 32 bytes.  Scratch area:
      0..15(%rsp)   high half of the second input
     16..31(%rsp)   high half of the first input
     32..47(%rsp)   result for the low halves  */
.macro WRAPPER_IMPL_AVX_ff callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-32, %rsp
        subq      $64, %rsp
        /* Spill both high halves; the low halves are already in
           %xmm0 and %xmm1.  */
        vextractf128 $1, %ymm1, (%rsp)
        vextractf128 $1, %ymm0, 16(%rsp)
        /* Clear the upper YMM halves before entering the SSE callee.  */
        vzeroupper
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   %xmm0, 32(%rsp)
        /* Second call on the spilled high halves.  */
        vmovaps   16(%rsp), %xmm0
        vmovaps   (%rsp), %xmm1
        call      HIDDEN_JUMPTARGET(\callee)
        /* %ymm0 = { low result, high result }.  */
        vmovaps   %xmm0, %xmm1
        vmovaps   32(%rsp), %xmm0
        vinsertf128 $1, %xmm1, %ymm0, %ymm0
        /* leave = movq %rbp, %rsp followed by popq %rbp.  */
        leave
        cfi_def_cfa_register (%rsp)
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
193 
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Input:  %ymm0 = eight packed floats, %rdi and %rsi = two result
           arrays of eight floats each.

   The first call hands the caller's %rdi / %rsi to the callee together
   with the low half of the input.  The second call gets the high half
   and two 16-byte stack buffers; their contents are then copied to
   byte offset 16 of the caller's arrays.

   %r13 and %r14 keep the caller's %rdi and %rsi across the calls.
   They are pushed before %rsp is realigned, so each one lives at a
   fixed offset from %rbp and the unwind information can describe it.
   (CFA adjustments must not be emitted for these pushes: the CFA is
   computed from %rbp at that point.)

   Scratch area (64 bytes, 32-byte aligned):
      0..31(%rsp)   input vector; later reused as the two result
                    buffers of the second call (0..15 and 16..31)
     32..47(%rsp)   high half of the input  */
.macro WRAPPER_IMPL_AVX_fFF callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        /* Offsets below are relative to %rbp, the CFA register.  */
        pushq     %r13
        cfi_rel_offset (%r13, -8)
        pushq     %r14
        cfi_rel_offset (%r14, -16)
        andq      $-32, %rsp
        subq      $64, %rsp
        movq      %rsi, %r14
        vmovaps   %ymm0, (%rsp)
        movq      %rdi, %r13
        /* Keep the high half out of the way of the result buffers.  */
        vmovaps   16(%rsp), %xmm1
        vmovaps   %xmm1, 32(%rsp)
        /* Clear the upper YMM halves before entering the SSE callee.  */
        vzeroupper
        vmovaps   (%rsp), %xmm0
        call      HIDDEN_JUMPTARGET(\callee)
        /* High half, with results directed to the stack buffers.  */
        vmovaps   32(%rsp), %xmm0
        lea       (%rsp), %rdi
        lea       16(%rsp), %rsi
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   (%rsp), %xmm0
        vmovaps   16(%rsp), %xmm1
        /* Unaligned stores: nothing visible here guarantees that the
           caller's arrays are 16-byte aligned.  */
        vmovups   %xmm0, 16(%r13)
        vmovups   %xmm1, 16(%r14)
        /* Point %rsp at the saved %r14 and unwind the frame.  */
        leaq      -16(%rbp), %rsp
        popq      %r14
        cfi_restore (%r14)
        popq      %r13
        cfi_restore (%r13)
        /* %rsp equals %rbp again here.  */
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
239 
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   Input:   %zmm0 = sixteen packed floats.
   Output:  %zmm0 = callee result for the low half in bits 0..255 and
            for the high half in bits 256..511.

   The callee is called twice with a 256-bit half in %ymm0.  %rbp is
   used as frame pointer because %rsp is realigned to 64 bytes.
   Scratch area:
      0..63(%rsp)    input vector
     64..127(%rsp)   the two results, low half first  */
.macro WRAPPER_IMPL_AVX512 callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $128, %rsp
        vmovups   %zmm0, (%rsp)
        /* Low half.  */
        vmovups   (%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovups   %ymm0, 64(%rsp)
        /* High half.  */
        vmovups   32(%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovups   %ymm0, 96(%rsp)
        /* Both results are adjacent in memory; load them as one.  */
        vmovups   64(%rsp), %zmm0
        /* leave = movq %rbp, %rsp followed by popq %rbp.  */
        leave
        cfi_def_cfa_register (%rsp)
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
264 
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   Input:   %zmm0, %zmm1 = two vectors of sixteen packed floats.
   Output:  %zmm0 = callee result for the low halves in bits 0..255
            and for the high halves in bits 256..511.

   The callee is called twice with matching 256-bit halves in %ymm0
   and %ymm1.  %rbp is used as frame pointer because %rsp is realigned
   to 64 bytes.  Scratch area:
       0..63(%rsp)    first input vector
      64..127(%rsp)   second input vector
     128..191(%rsp)   the two results, low half first  */
.macro WRAPPER_IMPL_AVX512_ff callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $192, %rsp
        /* Spill each input and reload its low half.  */
        vmovups   %zmm0, (%rsp)
        vmovups   (%rsp), %ymm0
        vmovups   %zmm1, 64(%rsp)
        vmovups   64(%rsp), %ymm1
        call      HIDDEN_JUMPTARGET(\callee)
        vmovups   %ymm0, 128(%rsp)
        /* High halves.  */
        vmovups   96(%rsp), %ymm1
        vmovups   32(%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        vmovups   %ymm0, 160(%rsp)
        /* Both results are adjacent in memory; load them as one.  */
        vmovups   128(%rsp), %zmm0
        /* leave = movq %rbp, %rsp followed by popq %rbp.  */
        leave
        cfi_def_cfa_register (%rsp)
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
292 
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   Input:  %zmm0 = sixteen packed floats, %rdi and %rsi = two result
           arrays of sixteen floats each.

   The first call hands the caller's %rdi / %rsi to the callee together
   with the low half of the input.  The second call gets the high half
   and two 32-byte stack buffers; their contents are then copied to
   byte offset 32 of the caller's arrays.

   %r12 and %r13 keep the caller's %rdi and %rsi across the calls.
   They are pushed before %rsp is realigned, so each one lives at a
   fixed offset from %rbp and the unwind information can describe it.

   Scratch area (128 bytes, 64-byte aligned):
      0..63(%rsp)    input vector
     64..95(%rsp)    second-call result buffer passed in %rdi
     96..127(%rsp)   second-call result buffer passed in %rsi  */
.macro WRAPPER_IMPL_AVX512_fFF callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        /* Offsets below are relative to %rbp, the CFA register.  */
        pushq     %r12
        cfi_rel_offset (%r12, -8)
        pushq     %r13
        cfi_rel_offset (%r13, -16)
        andq      $-64, %rsp
        subq      $128, %rsp
        movq      %rsi, %r13
        vmovaps   %zmm0, (%rsp)
        movq      %rdi, %r12
        /* Low half, results go to the caller's arrays.  */
        vmovaps   (%rsp), %ymm0
        call      HIDDEN_JUMPTARGET(\callee)
        /* High half, with results directed to the stack buffers.  */
        vmovaps   32(%rsp), %ymm0
        lea       64(%rsp), %rdi
        lea       96(%rsp), %rsi
        call      HIDDEN_JUMPTARGET(\callee)
        vmovaps   64(%rsp), %ymm0
        vmovaps   96(%rsp), %ymm1
        /* Unaligned stores: nothing visible here guarantees that the
           caller's arrays are 32-byte aligned.  */
        vmovups   %ymm0, 32(%r12)
        vmovups   %ymm1, 32(%r13)
        /* Point %rsp at the saved %r13 and unwind the frame.  */
        leaq      -16(%rbp), %rsp
        popq      %r13
        cfi_restore (%r13)
        popq      %r12
        cfi_restore (%r12)
        /* %rsp equals %rbp again here.  */
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm
327