/* Function atanf vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 * NOTE(review): the code below applies the same identity
 *      atan(x) = atan(c) + atan((x-c)/(1+c*x))
 * but with c = |x| rounded to 2 fractional bits, a 32-entry table
 * (Tbl_H; its entries look like atan(k/4), k = 0..31) and a 3-coefficient
 * polynomial, so the breakpoints and "Poly11" above do not literally
 * describe this implementation -- confirm before relying on them.
 *
 */

/* Offsets for data table __svml_satan_data_internal_avx512.
   Every field except Tbl_H is one 64-byte vector (16 x 32-bit lanes);
   Tbl_H occupies two vectors (32 entries).  */
#define AbsMask		0
#define Shifter		64
#define MaxThreshold	128
#define MOne		192
#define One		256
#define LargeX		320
#define Zero		384
#define Tbl_H		448
#define Pi2		576
#define coeff_1		640
#define coeff_2		704
#define coeff_3		768

#include <sysdep.h>

/* _ZGVeN16v_atanf_skx: 16-lane single-precision atan.
   In:    %zmm0 = x (16 packed floats)
   Out:   %zmm0 = atan(x) per lane
   Uses:  %zmm1-%zmm15 and %k1 as scratch; no stack, no calls.
   All lanes follow the same straight-line path; lanes with
   |x| >= MaxThreshold are redirected by mask %k1 to the
   "Pi/2 + atan(-1/|x|)" form.  */

	/* Section name: was ".text.exex512" (typo for evex512).  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanf_skx)
	/* zmm7 = |x|.  AbsMask sits at offset 0, hence the bare symbol.  */
	vandps	__svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
	vmovups	MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
	vmovups	One+__svml_satan_data_internal_avx512(%rip), %zmm8

	/* round to 2 bits after binary point:
	   zmm5 = DiffX = |x| - round(|x|, 2 fraction bits)  */
	vreduceps $40, {sae}, %zmm7, %zmm5

	/* saturate X range: k1 = (|x| >= MaxThreshold), predicate 29 = GE_OQ  */
	vmovups	LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
	vmovups	Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
	vcmpps	$29, {sae}, %zmm3, %zmm7, %k1

	/* table lookup sequence */
	vmovups	Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
	/* zmm4 = |x| - DiffX, i.e. the rounded breakpoint c  */
	vsubps	{rn-sae}, %zmm5, %zmm7, %zmm4
	/* zmm1 = |x| + Shifter; its low bits are used as the table index  */
	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm1
	/* zmm0 = x ^ |x| = sign bit of x, re-applied at the very end  */
	vxorps	%zmm0, %zmm7, %zmm0
	/* zmm8 = Y = 1 + c*|x|  */
	vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
	vmovups	coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4

	/* if|X|>=MaxThreshold, set DiffX=-1 */
	vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
	vmovups	coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5

	/* if|X|>=MaxThreshold, set Y=X (clamped to LargeX); other lanes
	   keep Y through merge-masking  */
	vminps	{sae}, %zmm7, %zmm6, %zmm8{%k1}

	/* R+Rl = DiffX/Y
	   The division is done on mantissas (rcp14 estimate plus one
	   FMA-based correction); the exponent difference is restored with
	   vscalefps.  */
	vgetmantps $0, {sae}, %zmm9, %zmm12
	vgetexpps {sae}, %zmm9, %zmm10
	/* zmm3 = Tbl_H[index], selecting across both 16-entry halves  */
	vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
	vgetmantps $0, {sae}, %zmm8, %zmm15
	vgetexpps {sae}, %zmm8, %zmm11
	vmovups	coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1

	/* set table value to Pi/2 for large X */
	vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
	/* zmm13 ~= 1/mant(Y)  */
	vrcp14ps %zmm15, %zmm13
	/* zmm2 = exp(DiffX) - exp(Y)  */
	vsubps	{rn-sae}, %zmm11, %zmm10, %zmm2
	/* zmm14 = q0 = mant(DiffX) * rcp  */
	vmulps	{rn-sae}, %zmm13, %zmm12, %zmm14
	/* zmm15 = mant(DiffX) - q0*mant(Y)  (residual)  */
	vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
	/* zmm15 = q0 + rcp*residual  (refined quotient)  */
	vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
	/* zmm7 = R = quotient scaled by 2^(exponent difference)  */
	vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7

	/* polynomial evaluation:
	   R + R^3 * (coeff_3 + R^2 * (coeff_2 + R^2 * coeff_1))  */
	vmulps	{rn-sae}, %zmm7, %zmm7, %zmm8
	vmulps	{rn-sae}, %zmm7, %zmm8, %zmm6
	vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
	vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
	/* add the table value (or Pi/2), then restore the sign of x  */
	vaddps	{rn-sae}, %zmm9, %zmm8, %zmm10
	vxorps	%zmm0, %zmm10, %zmm0
	ret

END(_ZGVeN16v_atanf_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_satan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 AbsMask[16][1];
	__declspec(align(64)) VUINT32 Shifter[16][1];
	__declspec(align(64)) VUINT32 MaxThreshold[16][1];
	__declspec(align(64)) VUINT32 MOne[16][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 LargeX[16][1];
	__declspec(align(64)) VUINT32 Zero[16][1];
	__declspec(align(64)) VUINT32 Tbl_H[32][1];
	__declspec(align(64)) VUINT32 Pi2[16][1];
	__declspec(align(64)) VUINT32 coeff[3][16][1];
} __svml_satan_data_internal_avx512;
#endif
/* Constant table; the layout must match the offset #defines at the top
   of this file.  */
__svml_satan_data_internal_avx512:
	/* AbsMask */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* Shifter */
	.align	64
	.long	0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
	/* MaxThreshold */
	.align	64
	.long	0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
	/* MOne */
	.align	64
	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
	/* One */
	.align	64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* LargeX */
	.align	64
	.long	0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
	/* Zero */
	.align	64
	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
	/* Tbl_H */
	.align	64
	.long	0x00000000, 0x3e7adbb0
	.long	0x3eed6338, 0x3f24bc7d
	.long	0x3f490fdb, 0x3f6563e3
	.long	0x3f7b985f, 0x3f869c79
	.long	0x3f8db70d, 0x3f93877b
	.long	0x3f985b6c, 0x3f9c6b53
	.long	0x3f9fe0bb, 0x3fa2daa4
	.long	0x3fa57088, 0x3fa7b46f
	.long	0x3fa9b465, 0x3fab7b7a
	.long	0x3fad1283, 0x3fae809e
	.long	0x3fafcb99, 0x3fb0f836
	.long	0x3fb20a6a, 0x3fb30581
	.long	0x3fb3ec43, 0x3fb4c10a
	.long	0x3fb585d7, 0x3fb63c64
	.long	0x3fb6e62c, 0x3fb78478
	.long	0x3fb81868, 0x3fb8a2f5
	/* Pi2 */
	.align	64
	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
	/* coeff: three consecutive 64-byte rows -- coeff_1, coeff_2, coeff_3 */
	.align	64
	.long	0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
	.long	0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
	.long	0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
	.align	64
	.type	__svml_satan_data_internal_avx512, @object
	.size	__svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512