/* Function atan vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
*
*/

/* Offsets for data table __svml_datan_data_internal_avx512
 */
/* Each value is the byte offset of a field from the table label defined
   in the .rodata section of this file; the values must stay in step with
   the order and size of the data emitted after that label.  Zero, Tbl_L
   and Pi2_low are not referenced by the function visible in this file.  */
#define AbsMask				0
#define Shifter				16
#define MaxThreshold			32
#define MOne				48
#define One				64
#define LargeX				80
#define Zero				96
#define Tbl_H				112
#define Tbl_L				368
#define dIndexMed			624
#define Pi2				640
#define Pi2_low				656
#define coeff				672

#include <sysdep.h>

/* _ZGVbN2v_atan_sse4: atan on two packed doubles.

   In:      xmm0 = x (both 64-bit lanes are processed by packed-double ops).
   Out:     xmm0 = result for both lanes, carrying the sign bit of x.
   Writes:  rax, rcx, rdx, xmm0-xmm15.  The visible body contains no
	    push/pop and no explicit use of rsp.
   NOTE(review): the name looks like the libmvec vector-ABI mangling of
   atan for 2 lanes -- confirm against the vector ABI document.

   Outline of what the code below does, per lane:
     1. a = |x|; the sign bit of x is kept aside in xmm0.
     2. c = (a + Shifter) - Shifter, i.e. a rounded by the Shifter add;
	the low bits of (a + Shifter) are used as the table index.
     3. R = (a - c) / (1 + c*a); when MaxThreshold <= a the quotient is
	replaced by -1 / min (a, LargeX).
     4. P = R + R^3 * Poly (R^2), Poly having the six coeff entries.
     5. T = Tbl_H[index]; when MaxThreshold <= a, T is replaced by Pi2.
     6. result = (T + P) with the saved sign bit xor-ed back in.  */
	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2v_atan_sse4)
	/* rcx = address of Tbl_H entry 16; the lookup further down reaches
	   entries 0..15 through -128(%rcx) and entries 16..31 through
	   0(%rcx).  */
	lea	Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx
	movups	__svml_datan_data_internal_avx512(%rip), %xmm4	/* xmm4 = AbsMask (offset 0) */
	movups	Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3	/* xmm3 = Shifter */
	andps	%xmm0, %xmm4		/* xmm4 = a = x & AbsMask = |x| */
	movaps	%xmm3, %xmm12
	movaps	%xmm4, %xmm5		/* xmm5 = a */
	addpd	%xmm4, %xmm12		/* xmm12 = a + Shifter (kept until the dIndexMed compare) */
	movaps	%xmm12, %xmm7

	/*
	 * table lookup sequence
	 * VPERMUTE not available
	 */
	movaps	%xmm12, %xmm10		/* xmm10 = bits of (a + Shifter), source of the index */
	subpd	%xmm3, %xmm7		/* xmm7 = c = (a + Shifter) - Shifter */
	subpd	%xmm7, %xmm5		/* xmm5 = DiffX = a - c */
	mulpd	%xmm4, %xmm7		/* xmm7 = c * a */
	movups	MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
	psllq	$3, %xmm10		/* index * 8: scale to a byte offset into 8-byte entries */

	/* saturate X range */
	movups	LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
	pxor	%xmm4, %xmm0		/* xmm0 = x ^ |x|: only the sign bit of x remains */
	cmplepd	%xmm4, %xmm2		/* xmm2 = all-ones where MaxThreshold <= a ("large" mask) */
	addpd	One+__svml_datan_data_internal_avx512(%rip), %xmm7	/* xmm7 = Y = 1 + c*a */
	minpd	%xmm4, %xmm8		/* xmm8 = min (LargeX, a) */
	movups	MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
	movaps	%xmm2, %xmm1
	movaps	%xmm2, %xmm9
	andnps	%xmm5, %xmm1		/* xmm1 = DiffX where not large, else 0 */
	andps	%xmm2, %xmm6		/* xmm6 = MOne where large, else 0 */
	andnps	%xmm7, %xmm9		/* xmm9 = Y where not large, else 0 */
	andps	%xmm2, %xmm8		/* xmm8 = min (LargeX, a) where large, else 0 */
	orps	%xmm6, %xmm1		/* numerator: large ? MOne : DiffX */
	orps	%xmm8, %xmm9		/* denominator: large ? min (LargeX, a) : Y */

	/* R+Rl = DiffX/Y */
	divpd	%xmm9, %xmm1		/* xmm1 = R = numerator / denominator */
	/* Keep only the bits selected by .FLT_11 (0x78 in the low dword of
	   each lane): a byte offset 0, 8, ..., 120 within a 16-entry half
	   of Tbl_H.  */
	pand	.FLT_11(%rip), %xmm10

	/* set table value to Pi/2 for large X */
	movups	Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
	movd	%xmm10, %eax		/* eax = table byte offset for lane 0 */
	andps	%xmm2, %xmm4		/* xmm4 = Pi2 where large, else 0 */
	pshufd	$2, %xmm10, %xmm11	/* bring lane 1's low dword down to dword 0 */
	movaps	%xmm2, %xmm10		/* xmm10 = large mask (xmm2 is reused for R^2 next) */

	/* polynomial evaluation */
	/* With R2 = R*R, R4 = R2*R2 and c0..c5 the entries at coeff+0,
	   +16, ..., +80, the code forms
	     Poly = c5 + c4*R2 + R4*(c3 + c2*R2 + R4*(c1 + c0*R2))
	   and then xmm1 = R + (R2*R) * Poly.  */
	movaps	%xmm1, %xmm2
	mulpd	%xmm1, %xmm2		/* xmm2 = R2 */
	movd	%xmm11, %edx		/* edx = table byte offset for lane 1 */
	movups	coeff+__svml_datan_data_internal_avx512(%rip), %xmm5	/* c0 */
	movaps	%xmm2, %xmm7
	movups	coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6	/* c2 */
	movaps	%xmm2, %xmm9
	mulpd	%xmm2, %xmm5		/* c0*R2 */
	mulpd	%xmm2, %xmm7		/* xmm7 = R4 */
	addpd	coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5	/* c0*R2 + c1 */
	mulpd	%xmm2, %xmm6		/* c2*R2 */
	mulpd	%xmm7, %xmm5		/* R4*(c0*R2 + c1) */
	addpd	coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6	/* c2*R2 + c3 */
	mulpd	%xmm1, %xmm9		/* xmm9 = R2*R */
	addpd	%xmm5, %xmm6		/* c3 + c2*R2 + R4*(c1 + c0*R2) */
	movups	coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8	/* c4 */
	mulpd	%xmm2, %xmm8		/* c4*R2 */
	mulpd	%xmm6, %xmm7		/* R4 * (previous sum) */
	addpd	coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8	/* c4*R2 + c5 */
	addpd	%xmm7, %xmm8		/* xmm8 = Poly */
	mulpd	%xmm8, %xmm9		/* xmm9 = R^3 * Poly */
	movups	dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
	cmplepd	%xmm12, %xmm14		/* xmm14 = all-ones where dIndexMed <= a + Shifter */
	addpd	%xmm9, %xmm1		/* xmm1 = P = R + R^3 * Poly */
	movslq	%eax, %rax		/* sign-extend the lane 0 offset for addressing */
	movaps	%xmm14, %xmm3
	movslq	%edx, %rdx		/* sign-extend the lane 1 offset for addressing */
	/* Gather one candidate from each half of Tbl_H for both lanes:
	   xmm13 from entries 0..15, xmm15 from entries 16..31.  */
	movsd	-128(%rax, %rcx), %xmm13
	movsd	(%rcx, %rax), %xmm15
	movhpd	-128(%rdx, %rcx), %xmm13
	movhpd	(%rcx, %rdx), %xmm15
	andnps	%xmm13, %xmm3		/* lower-half entry where the dIndexMed mask is clear */
	andps	%xmm14, %xmm15		/* upper-half entry where the dIndexMed mask is set */
	orps	%xmm15, %xmm3		/* xmm3 = selected Tbl_H entry */
	andnps	%xmm3, %xmm10		/* drop the table entry in large lanes */
	orps	%xmm4, %xmm10		/* ... and use Pi2 there instead */
	addpd	%xmm1, %xmm10		/* table value + P */
	pxor	%xmm10, %xmm0		/* xor the saved sign bit of x into the result */
	ret

END(_ZGVbN2v_atan_sse4)

	.section .rodata, "a"
	.align	16

/* Reference C description of the table layout.  The preprocessor only
   keeps it if __svml_datan_data_internal_avx512_typedef is defined, and
   nothing in the visible source defines that macro.  */
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 AbsMask[2][2];
	__declspec(align(16)) VUINT32 Shifter[2][2];
	__declspec(align(16)) VUINT32 MaxThreshold[2][2];
	__declspec(align(16)) VUINT32 MOne[2][2];
	__declspec(align(16)) VUINT32 One[2][2];
	__declspec(align(16)) VUINT32 LargeX[2][2];
	__declspec(align(16)) VUINT32 Zero[2][2];
	__declspec(align(16)) VUINT32 Tbl_H[32][2];
	__declspec(align(16)) VUINT32 Tbl_L[32][2];
	__declspec(align(16)) VUINT32 dIndexMed[2][2];
	__declspec(align(16)) VUINT32 Pi2[2][2];
	__declspec(align(16)) VUINT32 Pi2_low[2][2];
	__declspec(align(16)) VUINT32 coeff[6][2][2];
} __svml_datan_data_internal_avx512;
#endif
/* Start of the constant table; every field below is addressed from this
   label using the offset macros defined at the top of the file.  */
__svml_datan_data_internal_avx512:
	/* AbsMask */
	/* All bits set except the sign bit; and-ed with x to form |x|.  */
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff
	/* Shifter */
/* Shifter data (offset 16).  0x4318000000000000 is 1.5 * 2^50; the function
   adds it to |x| and subtracts it again, and uses the low bits of the sum
   as the Tbl_H index.  */
	.align	16
	.quad	0x4318000000000000, 0x4318000000000000
	/* MaxThreshold */
	/* Offset 32.  0x401f800000000000 is 7.875; lanes with
	   MaxThreshold <= |x| take the -1/x path and use Pi2 instead of a
	   Tbl_H entry.  */
	.align	16
	.quad	0x401f800000000000, 0x401f800000000000
	/* MOne */
	/* Offset 48.  -1.0, the numerator used for large lanes.  */
	.align	16
	.quad	0xbff0000000000000, 0xbff0000000000000
	/* One */
	/* Offset 64.  1.0, added to c*|x| to form the denominator.  */
	.align	16
	.quad	0x3ff0000000000000, 0x3ff0000000000000
	/* LargeX */
	/* Offset 80.  2^128; upper clamp applied to |x| with minpd before
	   it is used as the denominator for large lanes.  */
	.align	16
	.quad	0x47f0000000000000, 0x47f0000000000000
	/* Zero */
	/* Offset 96.  Not referenced by the function in this file.  */
	.align	16
	.quad	0x0000000000000000, 0x0000000000000000
	/* Tbl_H */
	/* Offset 112.  32 entries of 8 bytes, indexed by the low bits of
	   (|x| + Shifter).  The values appear to be atan (i * 0.25) for
	   i = 0..31 (entry 4 is 0x3fe921fb54442d18, i.e. pi/4) -- verify
	   against the generator of this table.  */
	.align	16
	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd
	.quad	0x3fddac670561bb4f, 0x3fe4978fa3269ee1
	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e
	.quad	0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
	.quad	0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0
	.quad	0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
	.quad	0x3ff5a25052114e60, 0x3ff5d013c41adabd
	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89
	.quad	0x3ff6414d44094c7c, 0x3ff660b02c736a06
	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053
	.quad	0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
	.quad	0x3ff7030cf9403197, 0x3ff7145eac2088a4
	/* Tbl_L */
	/* Offset 368.  32 entries of 8 bytes, not referenced by the
	   function in this file.  Presumably the low-order parts that pair
	   with Tbl_H -- TODO confirm.  */
	.align	16
	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd
	.quad	0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b
	.quad	0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
	.quad	0x3c96254cb03bb199, 0xbc812c77e8a80f5c
	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
	.quad	0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
	.quad	0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5
	.quad	0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
	.quad	0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
	.quad	0xbc929c86447928e7, 0xbc8957a7170df016
	.quad	0xbc7cbe1896221608, 0xbc9fda5797b32a0b
	/* dIndexMed */
	/* Offset 624.  The Shifter bit pattern with 0x10 in the low bits;
	   lanes where dIndexMed <= |x| + Shifter read the upper 16 entries
	   of Tbl_H, the others read the lower 16.  */
	.align	16
	.quad	0x4318000000000010, 0x4318000000000010
	/* Pi2 */
	/* Offset 640.  pi/2, used in place of the table entry for large
	   lanes.  */
	.align	16
	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18
	/* Pi2_low */
	/* Offset 656.  Not referenced by the function in this file;
	   presumably the low-order part of pi/2 -- TODO confirm.  */
	.align	16
	.quad	0x3c91a62633145c07, 0x3c91a62633145c07
	/* coeff: six polynomial coefficients, each duplicated for both lanes */
	/* Offset 672.  Read by the function as coeff+0, +16, ..., +80, from
	   the highest-order term (first row) down to the constant term
	   (last row) of the polynomial in R^2.  */
	.align	16
	.quad	0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
	.quad	0xbfb74257c46790cc, 0xbfb74257c46790cc
	.quad	0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
	.quad	0xbfc249248eef04da, 0xbfc249248eef04da
	.quad	0x3fc999999998741e, 0x3fc999999998741e
	.quad	0xbfd555555555554d, 0xbfd555555555554d
	.align	16
	.type	__svml_datan_data_internal_avx512, @object
	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
	.align	16

/* Index mask applied after the index has been shifted left by 3: 0x78 in
   the low dword of each 64-bit lane keeps a byte offset of 0, 8, ..., 120,
   i.e. one of 16 consecutive 8-byte table entries.  */
.FLT_11:
	.long	0x00000078, 0x00000000, 0x00000078, 0x00000000
	.type	.FLT_11, @object
	.size	.FLT_11, 16