1/* Function cosf vectorized with AVX2.
2   Copyright (C) 2014-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19
20#include <sysdep.h>
21#include "svml_s_trig_data.h"
22
23	.text
ENTRY (_ZGVdN8v_cosf_avx2)
/*
  Eight-lane single-precision cosine (AVX2 + FMA instructions).

  In:    %ymm0 = eight float arguments X
  Out:   %ymm0 = eight float results
  Stack: frame pointer in %rbp; 448-byte scratch area, %rsp aligned to 64.

  ALGORITHM DESCRIPTION:

  1) Range reduction to [-Pi/2; +Pi/2] interval
    a) We remove sign using AND operation
    b) Add Pi/2 value to argument X for Cos to Sin transformation
    c) Getting octant Y by 1/Pi multiplication
    d) Add "Right Shifter" value
    e) Treat obtained value as integer for destination sign setting.
       Shift first bit of this value to the last (sign) position
    f) Subtract "Right Shifter"  value
    g) Subtract 0.5 from result for octant correction
    h) Subtract Y*PI from X argument.  This FMA variant uses PI
       divided into 3 parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
  2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
    a) Calculate X^2 = X * X
    b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
  3) Destination sign setting
    a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );

  Lanes whose |X| compares "not less-or-equal" against
  __sRangeReductionVal (this predicate is also true for NaN) are
  recomputed one by one with the scalar cosf.

  Register use on the fast path:
    %rax  = address of __svml_s_trig_data
    %ymm2 = original argument X (kept for the scalar fallback)
    %ymm1 = per-lane mask of arguments needing the scalar fallback
    %ecx  = the same mask packed into bits 0..7
    %ymm0 = shifted sign S, later the polynomial accumulator / result

  Scratch layout used by the scalar fallback (offsets from %rsp):
      0..255   saved %ymm15 .. %ymm8 (32 bytes each, %ymm15 lowest)
    256, 264   saved %rdi, %rsi
    272..296   saved %r15, %r14, %r13, %r12
    320        copy of the argument vector (8 floats)
    384        result vector (8 floats), patched lane by lane
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %ymm0, %ymm2
        vmovups __sRShifter(%rax), %ymm5
        vmovups __sPI1_FMA(%rax), %ymm7

/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
        vaddps __sHalfPI(%rax), %ymm2, %ymm4

/*
  1) Range reduction to [-Pi/2; +Pi/2] interval
  c) Getting octant Y by 1/Pi multiplication
  d) Add "Right Shifter" (0x4B000000) value
  ymm4 = ymm4 * InvPI + RShifter
 */
        vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4

/* f) Subtract "Right Shifter" (0x4B000000) value */
        vsubps    %ymm5, %ymm4, %ymm6

/*
  e) Treat obtained value as integer for destination sign setting.
  Shift first bit of this value to the last (sign) position (S << 31)
 */
        vpslld    $31, %ymm4, %ymm0

/* g) Subtract 0.5 from result for octant correction */
        vsubps __sOneHalf(%rax), %ymm6, %ymm4

/*
  Check for large and special arguments:
  ymm1 lane = all ones when !(|X| <= RangeReductionVal)
 */
        vandps __sAbsMask(%rax), %ymm2, %ymm3
        vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1

/*
  h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
  X = X - Y*PI1 - Y*PI2 - Y*PI3
  The last FMA writes the reduced argument into ymm4.
 */
        vmovaps   %ymm2, %ymm3
        vfnmadd231ps %ymm4, %ymm7, %ymm3
        vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
        vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4

/* a) Calculate X^2 = X * X */
        vmulps    %ymm4, %ymm4, %ymm5

/*
  3) Destination sign setting
  a) Set shifted destination sign using XOR operation:
  R = XOR( R, S );
  The sign is applied to the reduced argument before the polynomial
  is evaluated; ymm0 is then free to hold the accumulator.
 */
        vxorps    %ymm0, %ymm4, %ymm6
        vmovups __sA9_FMA(%rax), %ymm0

/*
  b) Calculate polynomial:
  R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
  The mask extraction is interleaved with the FMA chain.
 */
        vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA3(%rax), %ymm5, %ymm0
        vmulps    %ymm5, %ymm0, %ymm0
        vmovmskps %ymm1, %ecx
        vfmadd213ps %ymm6, %ymm6, %ymm0

/* Any lane flagged for the scalar fallback?  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: result is in ymm0; tear down the frame.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/*
  Scalar fallback.  Only reached through the jne above, so the mask in
  ecx is known to be non-zero here.
 */
.LBL_1_3:
        cfi_restore_state

/* Spill the arguments and the fast-path results so that single lanes
   can be read and overwritten through memory.  */
        vmovups   %ymm2, 320(%rsp)
        vmovups   %ymm0, 384(%rsp)

/*
  Preserve registers across the cosf calls.
  NOTE(review): ymm8-ymm15, rsi and rdi are saved and restored here;
  presumably callers of this vector entry point rely on that - confirm
  before removing.
 */
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)

/*
  Loop state lives in callee-saved registers so it survives the calls:
    r12b = index of the current lane pair
    r13d = fallback mask (bits 0..7)
    r14d = bit number of the even lane of the current pair
    r15  = zero-extended pair index used for addressing
 */
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        xorl      %r12d, %r12d
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        xorl      %r14d, %r14d
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

/* Even lane of the pair flagged?  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

/* Odd lane of the pair flagged?  */
.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

/*
  Advance to the next pair.  vmovmskps on a ymm register produces
  8 mask bits, i.e. 4 pairs of lanes.
 */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $4, %r12b
        jb        .LBL_1_6

/* All lanes handled: restore registers, reload the patched result.  */
        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovups   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

/* Recompute the odd lane of pair r12b: argument at 320+4+8*pair,
   result stored at 384+4+8*pair.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    324(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 388(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Recompute the even lane of pair r12b: argument at 320+8*pair,
   result stored at 384+8*pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    320(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 384(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVdN8v_cosf_avx2)
216