/* Function sin vectorized with AVX-512, KNL and SKX versions.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"

	.text
ENTRY (_ZGVeN8v_sin_knl)
/*
   ALGORITHM DESCRIPTION:

      (low accuracy (< 4 ulp) or enhanced performance
      (half of correct mantissa) implementation)

      Argument representation:
      arg = N*Pi + R

      Result calculation:
      sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
      sin(R) is approximated by the corresponding polynomial.
 */
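
/* Scalar C model of the vector code below.  This is an illustrative
   sketch only: the identifiers and constant values are stand-ins, not
   the actual __dInvPI, __dRShifter, __dPI?_FMA and __dC?_sin entries
   of __svml_d_trig_data, and the real C1..C7 are minimax coefficients
   rather than the Taylor values used here:

   #include <math.h>
   #include <stdint.h>
   #include <string.h>

   static const double inv_pi   = 0x1.45f306dc9c883p-2; // ~1/Pi
   static const double rshifter = 0x1.8p52;             // 2^52 + 2^51
   static const double pi1 = 0x1.921fb54442d18p+1;      // Cody-Waite
   static const double pi2 = 0x1.1a62633145c07p-53;     // split of Pi
   static const double c[7] =                           // Taylor C1..C7
     { -1/6., 1/120., -1/5040., 1/362880., -1/39916800.,
       1/6227020800., -1/1307674368000. };

   static double
   sin_model (double x)
   {
     double ax = fabs (x);                   // X' = |X|
     double y  = fma (ax, inv_pi, rshifter); // Y: right shifter add
     double n  = y - rshifter;               // N = Y - RS
     double r  = fma (-n, pi1, ax);          // R = X' - N*Pi1
     r = fma (-n, pi2, r);                   // R = R - N*Pi2 (the
                                             // vector code adds a
                                             // third step with Pi3)
     uint64_t ybits, rbits;                  // SignRes = Y<<63: the
     memcpy (&ybits, &y, 8);                 // parity of N sits in
     memcpy (&rbits, &r, 8);                 // the LSB of Y, so the
     rbits ^= ybits << 63;                   // shift yields (-1)^N
     memcpy (&r, &rbits, 8);
     double r2 = r * r;
     double p = c[6];                        // Horner scheme:
     for (int i = 5; i >= 0; i--)            // C1+R2*(C2+...+R2*C7)
       p = fma (p, r2, c[i]);
     p = fma (p * r2, r, r);                 // Poly = Poly*R + R
     return signbit (x) ? -p : p;            // Res = Poly ^ SignX
   }
*/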
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
        movq      $-1, %rdx
        vmovups __dAbsMask(%rax), %zmm6
        vmovups __dInvPI(%rax), %zmm1

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        vpandq    %zmm6, %zmm0, %zmm12
        vmovups __dPI1_FMA(%rax), %zmm2
        vmovups __dC7_sin(%rax), %zmm7

/* SignX - sign bit of X */
        vpandnq   %zmm0, %zmm6, %zmm11

/* R = X' - N*Pi1 */
        vmovaps   %zmm12, %zmm3

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1
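
/* Compare X' against the range limit: with predicate 22 (NLE_UQ) k1
   flags the lanes whose argument is too large (or NaN) for the fast
   reduction; those lanes get all-ones markers in zmm13.  */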
        vcmppd    $22, __dRangeVal(%rax), %zmm12, %k1
        vpbroadcastq %rdx, %zmm13{%k1}{z}

/* N = Y - RS : right shifter sub */
        vsubpd __dRShifter(%rax), %zmm1, %zmm4

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq    $63, %zmm1, %zmm5
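
/* Collect the special-lane markers into a bitmask in %ecx.  */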
        vptestmq  %zmm13, %zmm13, %k0
        vfnmadd231pd %zmm4, %zmm2, %zmm3
        kmovw     %k0, %ecx
        movzbl    %cl, %ecx

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4

/*
  POLYNOMIAL APPROXIMATION:
  R2 = R*R
 */
        vmulpd    %zmm4, %zmm4, %zmm8

/* R = R^SignRes : update sign of reduced argument */
        vpxorq    %zmm5, %zmm4, %zmm9
        vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7
        vmulpd    %zmm8, %zmm7, %zmm10

/* Poly = Poly*R + R */
        vfmadd213pd %zmm9, %zmm9, %zmm10

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        vpxorq    %zmm11, %zmm10, %zmm1
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

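/* Special-argument path: at least one lane fell outside the fast
   reduction range.  Spill the caller-saved state that the per-lane
   loop below clobbers, then fix up the flagged lanes with scalar
   sin calls.  */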
.LBL_1_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

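/* Per-lane fixup loop: each iteration tests two bits of the
   special-lane mask in %r13d and dispatches the set lanes to the
   scalar sin calls below.  */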
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        kmovw     1048(%rsp), %k4
        movq      1064(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1056(%rsp), %rdi
        kmovw     1032(%rsp), %k6
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups   1216(%rsp), %zmm1
        jmp       .LBL_1_2

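/* Scalar fixup for a single lane: reload the lane's input from the
   spill area (1152(%rsp) plus 16 bytes per lane pair; +8 for the odd
   lane), call sin, and store the result into the matching output
   slot at 1216(%rsp).  */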
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1160(%rsp,%r15), %xmm0
        call      JUMPTARGET(sin)
        vmovsd    %xmm0, 1224(%rsp,%r15)
        jmp       .LBL_1_8

.LBL_1_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1152(%rsp,%r15), %xmm0
        call      JUMPTARGET(sin)
        vmovsd    %xmm0, 1216(%rsp,%r15)
        jmp       .LBL_1_7
END (_ZGVeN8v_sin_knl)

ENTRY (_ZGVeN8v_sin_skx)
/*
   ALGORITHM DESCRIPTION:

      (low accuracy (< 4 ulp) or enhanced performance
      (half of correct mantissa) implementation)

      Argument representation:
      arg = N*Pi + R

      Result calculation:
      sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
      sin(R) is approximated by the corresponding polynomial.
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
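
/* Set zmm14 to all ones; the in-range lanes are cleared later,
   leaving NaN bit patterns only in the lanes that need the scalar
   fallback.  */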
        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
        vmovups __dAbsMask(%rax), %zmm7
        vmovups __dInvPI(%rax), %zmm2
        vmovups __dRShifter(%rax), %zmm1
        vmovups __dPI1_FMA(%rax), %zmm3
        vmovups __dC7_sin(%rax), %zmm8

/*
  ARGUMENT RANGE REDUCTION:
  X' = |X|
 */
        vandpd    %zmm7, %zmm0, %zmm13

/* SignX - sign bit of X */
        vandnpd   %zmm0, %zmm7, %zmm12

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm1, %zmm13, %zmm2
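
/* Predicate 18 (LE_OQ): k1 flags the lanes whose X' lies within the
   fast reduction range.  */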
        vcmppd    $18, __dRangeVal(%rax), %zmm13, %k1

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq    $63, %zmm2, %zmm6

/* N = Y - RS : right shifter sub */
        vsubpd    %zmm1, %zmm2, %zmm5

/* R = X' - N*Pi1 */
        vmovaps   %zmm13, %zmm4
        vfnmadd231pd %zmm5, %zmm3, %zmm4

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5

/*
  POLYNOMIAL APPROXIMATION:
  R2 = R*R
 */
        vmulpd    %zmm5, %zmm5, %zmm9

/* R = R^SignRes : update sign of reduced argument */
        vxorpd    %zmm6, %zmm5, %zmm10
        vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8
        vmulpd    %zmm9, %zmm8, %zmm11

/* Poly = Poly*R + R */
        vfmadd213pd %zmm10, %zmm10, %zmm11

/*
  RECONSTRUCTION:
  Final sign setting: Res = Poly^SignX
 */
        vxorpd    %zmm12, %zmm11, %zmm1
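
/* Clear the in-range lanes of the all-ones vector; the surviving
   all-ones lanes are NaN bit patterns, so an unordered self-compare
   (predicate 3) extracts the special-lane mask into %ecx.  */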
        vpandnq   %zmm13, %zmm13, %zmm14{%k1}
        vcmppd    $3, %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx
        testl     %ecx, %ecx
        jne       .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)
        je        .LBL_2_2

        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

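/* Per-lane fixup loop and scalar calls, as in the KNL version above;
   here vzeroupper precedes each call to avoid AVX/SSE transition
   penalties around the scalar sin.  */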
.LBL_2_6:
        btl       %r14d, %r13d
        jc        .LBL_2_12

.LBL_2_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_2_6

        kmovw     1048(%rsp), %k4
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm1
        movq      1064(%rsp), %rsi
        movq      1056(%rsp), %rdi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vzeroupper
        vmovsd    1160(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 1224(%rsp,%r15)
        jmp       .LBL_2_8

.LBL_2_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vzeroupper
        vmovsd    1152(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 1216(%rsp,%r15)
        jmp       .LBL_2_7
END (_ZGVeN8v_sin_skx)