1/* Function sincosf vectorized with SSE4.
2   Copyright (C) 2014-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21
22	.text
23ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
24/*
25   ALGORITHM DESCRIPTION:
26
27     1) Range reduction to [-Pi/4; +Pi/4] interval
28        a) Grab sign from source argument and save it.
29        b) Remove sign using AND operation
30        c) Getting octant Y by 2/Pi multiplication
31        d) Add "Right Shifter" value
32        e) Treat obtained value as integer S for destination sign setting.
33           SS = ((S-S&1)&2)<<30; For sin part
34           SC = ((S+S&1)&2)<<30; For cos part
35        f) Change destination sign if source sign is negative
36           using XOR operation.
37        g) Subtract "Right Shifter" (0x4B000000) value
38        h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
39           X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40     2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
41        a) Calculate X^2 = X * X
42        b) Calculate 2 polynomials for sin and cos:
43           RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
44           RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
45        c) Swap RS & RC if first bit of obtained value after
46           Right Shifting is set to 1. Using And, Andnot & Or operations.
47     3) Destination sign setting
48        a) Set shifted destination sign using XOR operation:
49           R1 = XOR( RS, SS );
50           R2 = XOR( RC, SC ).  */
51
52        pushq     %rbp
53        cfi_adjust_cfa_offset (8)
54        cfi_rel_offset (%rbp, 0)
55        movq      %rsp, %rbp
56        cfi_def_cfa_register (%rbp)
57        andq      $-64, %rsp
58        subq      $320, %rsp
59        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
60        movups    %xmm12, 176(%rsp)
61        movups    %xmm9, 160(%rsp)
62        movups __sAbsMask(%rax), %xmm12
63
64/* Absolute argument computation */
65        movaps    %xmm12, %xmm5
66        andnps    %xmm0, %xmm12
67        movups __sInvPI(%rax), %xmm7
68        andps     %xmm0, %xmm5
69
70/* c) Getting octant Y by 2/Pi multiplication
71   d) Add "Right Shifter" value.  */
72        mulps     %xmm5, %xmm7
73        movups    %xmm10, 144(%rsp)
74        movups __sPI1(%rax), %xmm10
75
76/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
77      X = X - Y*PI1 - Y*PI2 - Y*PI3.  */
78        movaps    %xmm10, %xmm1
79        addps __sRShifter(%rax), %xmm7
80
81/* e) Treat obtained value as integer S for destination sign setting */
82        movaps    %xmm7, %xmm9
83
84/* g) Subtract "Right Shifter" (0x4B000000) value */
85        subps __sRShifter(%rax), %xmm7
86        mulps     %xmm7, %xmm1
87        pslld     $31, %xmm9
88        movups __sPI2(%rax), %xmm6
89        movups    %xmm13, 112(%rsp)
90        movaps    %xmm5, %xmm13
91        movaps    %xmm6, %xmm2
92        subps     %xmm1, %xmm13
93        mulps     %xmm7, %xmm2
94        movups __sSignMask(%rax), %xmm3
95        movaps    %xmm5, %xmm1
96        movups __sOneHalf(%rax), %xmm4
97        subps     %xmm2, %xmm13
98        cmpnleps __sRangeReductionVal(%rax), %xmm5
99        movaps    %xmm3, %xmm2
100        andps     %xmm13, %xmm2
101        xorps     %xmm2, %xmm4
102
103/* Result sign calculations */
104        xorps     %xmm2, %xmm3
105        xorps     %xmm9, %xmm3
106
107/* Add correction term 0.5 for cos() part */
108        addps     %xmm7, %xmm4
109        movmskps  %xmm5, %ecx
110        mulps     %xmm4, %xmm10
111        mulps     %xmm4, %xmm6
112        subps     %xmm10, %xmm1
113        movups __sPI3(%rax), %xmm10
114        subps     %xmm6, %xmm1
115        movaps    %xmm10, %xmm6
116        mulps     %xmm7, %xmm6
117        mulps     %xmm4, %xmm10
118        subps     %xmm6, %xmm13
119        subps     %xmm10, %xmm1
120        movups __sPI4(%rax), %xmm6
121        mulps     %xmm6, %xmm7
122        mulps     %xmm6, %xmm4
123        subps     %xmm7, %xmm13
124        subps     %xmm4, %xmm1
125        xorps     %xmm9, %xmm13
126        xorps     %xmm3, %xmm1
127        movaps    %xmm13, %xmm4
128        movaps    %xmm1, %xmm2
129        mulps     %xmm13, %xmm4
130        mulps     %xmm1, %xmm2
131        movups __sA9(%rax), %xmm7
132
133/* 2) Polynomial (minimax for sin within  [-Pi/4; +Pi/4] interval)
134      a) Calculate X^2 = X * X
135      b) Calculate 2 polynomials for sin and cos:
136         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
137         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
138        movaps    %xmm7, %xmm3
139        mulps     %xmm4, %xmm3
140        mulps     %xmm2, %xmm7
141        addps __sA7(%rax), %xmm3
142        addps __sA7(%rax), %xmm7
143        mulps     %xmm4, %xmm3
144        mulps     %xmm2, %xmm7
145        addps __sA5(%rax), %xmm3
146        addps __sA5(%rax), %xmm7
147        mulps     %xmm4, %xmm3
148        mulps     %xmm2, %xmm7
149        addps __sA3(%rax), %xmm3
150        addps __sA3(%rax), %xmm7
151        mulps     %xmm3, %xmm4
152        mulps     %xmm7, %xmm2
153        mulps     %xmm13, %xmm4
154        mulps     %xmm1, %xmm2
155        addps     %xmm4, %xmm13
156        addps     %xmm2, %xmm1
157        xorps     %xmm12, %xmm13
158        testl     %ecx, %ecx
159        jne       .LBL_1_3
160
161.LBL_1_2:
162        cfi_remember_state
163        movups    160(%rsp), %xmm9
164        movaps    %xmm13, (%rdi)
165        movups    144(%rsp), %xmm10
166        movups    176(%rsp), %xmm12
167        movups    112(%rsp), %xmm13
168        movups    %xmm1, (%rsi)
169        movq      %rbp, %rsp
170        cfi_def_cfa_register (%rsp)
171        popq      %rbp
172        cfi_adjust_cfa_offset (-8)
173        cfi_restore (%rbp)
174        ret
175
176.LBL_1_3:
177        cfi_restore_state
178        movups    %xmm0, 128(%rsp)
179        movups    %xmm13, 192(%rsp)
180        movups    %xmm1, 256(%rsp)
181        je        .LBL_1_2
182
183        xorb      %dl, %dl
184        xorl      %eax, %eax
185        movups    %xmm8, 48(%rsp)
186        movups    %xmm11, 32(%rsp)
187        movups    %xmm14, 16(%rsp)
188        movups    %xmm15, (%rsp)
189        movq      %rsi, 64(%rsp)
190        movq      %r12, 104(%rsp)
191        cfi_offset_rel_rsp (12, 104)
192        movb      %dl, %r12b
193        movq      %r13, 96(%rsp)
194        cfi_offset_rel_rsp (13, 96)
195        movl      %eax, %r13d
196        movq      %r14, 88(%rsp)
197        cfi_offset_rel_rsp (14, 88)
198        movl      %ecx, %r14d
199        movq      %r15, 80(%rsp)
200        cfi_offset_rel_rsp (15, 80)
201        movq      %rbx, 72(%rsp)
202        movq      %rdi, %rbx
203        cfi_remember_state
204
205.LBL_1_6:
206        btl       %r13d, %r14d
207        jc        .LBL_1_13
208
209.LBL_1_7:
210        lea       1(%r13), %esi
211        btl       %esi, %r14d
212        jc        .LBL_1_10
213
214.LBL_1_8:
215        incb      %r12b
216        addl      $2, %r13d
217        cmpb      $16, %r12b
218        jb        .LBL_1_6
219
220        movups    48(%rsp), %xmm8
221        movq      %rbx, %rdi
222        movups    32(%rsp), %xmm11
223        movups    16(%rsp), %xmm14
224        movups    (%rsp), %xmm15
225        movq      64(%rsp), %rsi
226        movq      104(%rsp), %r12
227        cfi_restore (%r12)
228        movq      96(%rsp), %r13
229        cfi_restore (%r13)
230        movq      88(%rsp), %r14
231        cfi_restore (%r14)
232        movq      80(%rsp), %r15
233        cfi_restore (%r15)
234        movq      72(%rsp), %rbx
235        movups    192(%rsp), %xmm13
236        movups    256(%rsp), %xmm1
237        jmp       .LBL_1_2
238
239.LBL_1_10:
240        cfi_restore_state
241        movzbl    %r12b, %r15d
242        movss     132(%rsp,%r15,8), %xmm0
243
244        call      JUMPTARGET(sinf)
245
246        movss     %xmm0, 196(%rsp,%r15,8)
247        movss     132(%rsp,%r15,8), %xmm0
248
249        call      JUMPTARGET(cosf)
250
251        movss     %xmm0, 260(%rsp,%r15,8)
252        jmp       .LBL_1_8
253
254.LBL_1_13:
255        movzbl    %r12b, %r15d
256        movss     128(%rsp,%r15,8), %xmm0
257
258        call      JUMPTARGET(sinf)
259
260        movss     %xmm0, 192(%rsp,%r15,8)
261        movss     128(%rsp,%r15,8), %xmm0
262
263        call      JUMPTARGET(cosf)
264
265        movss     %xmm0, 256(%rsp,%r15,8)
266        jmp       .LBL_1_7
267
268END (_ZGVbN4vl4l4_sincosf_sse4)
269libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
270
271/* vvv version implemented with wrapper to vl4l4 variant.  */
272ENTRY (_ZGVbN4vvv_sincosf_sse4)
273#ifndef __ILP32__
274        subq      $104, %rsp
275        .cfi_def_cfa_offset 112
276        movdqu    %xmm1, 32(%rsp)
277        lea       (%rsp), %rdi
278        movdqu    %xmm2, 48(%rdi)
279        lea       16(%rsp), %rsi
280        movdqu    %xmm3, 48(%rsi)
281        movdqu    %xmm4, 64(%rsi)
282        call      HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
283        movq      32(%rsp), %rdx
284        movq      40(%rsp), %rsi
285        movq      48(%rsp), %r8
286        movq      56(%rsp), %r10
287        movl      (%rsp), %eax
288        movl      4(%rsp), %ecx
289        movl      8(%rsp), %edi
290        movl      12(%rsp), %r9d
291        movl      %eax, (%rdx)
292        movl      %ecx, (%rsi)
293        movq      64(%rsp), %rax
294        movq      72(%rsp), %rcx
295        movl      %edi, (%r8)
296        movl      %r9d, (%r10)
297        movq      80(%rsp), %rdi
298        movq      88(%rsp), %r9
299        movl      16(%rsp), %r11d
300        movl      20(%rsp), %edx
301        movl      24(%rsp), %esi
302        movl      28(%rsp), %r8d
303        movl      %r11d, (%rax)
304        movl      %edx, (%rcx)
305        movl      %esi, (%rdi)
306        movl      %r8d, (%r9)
307        addq      $104, %rsp
308        .cfi_def_cfa_offset 8
309        ret
310#else
311        subl    $72, %esp
312        .cfi_def_cfa_offset 80
313        leal    48(%rsp), %esi
314        movaps  %xmm1, 16(%esp)
315        leal    32(%rsp), %edi
316        movaps  %xmm2, (%esp)
317        call    HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
318        movl    16(%esp), %eax
319        movss   32(%esp), %xmm0
320        movss   %xmm0, (%eax)
321        movl    20(%esp), %eax
322        movss   36(%esp), %xmm0
323        movss   %xmm0, (%eax)
324        movl    24(%esp), %eax
325        movss   40(%esp), %xmm0
326        movss   %xmm0, (%eax)
327        movl    28(%esp), %eax
328        movss   44(%esp), %xmm0
329        movss   %xmm0, (%eax)
330        movl    (%esp), %eax
331        movss   48(%esp), %xmm0
332        movss   %xmm0, (%eax)
333        movl    4(%esp), %eax
334        movss   52(%esp), %xmm0
335        movss   %xmm0, (%eax)
336        movl    8(%esp), %eax
337        movss   56(%esp), %xmm0
338        movss   %xmm0, (%eax)
339        movl    12(%esp), %eax
340        movss   60(%esp), %xmm0
341        movss   %xmm0, (%eax)
342        addl    $72, %esp
343        .cfi_def_cfa_offset 8
344        ret
345#endif
346END (_ZGVbN4vvv_sincosf_sse4)
347