/* Function cbrt vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *     x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
 *     Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
 *     where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
 *     cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
 *     (T stores the high 53 bits, D stores the low order bits)
 *     Result=2^k*T+(2^k*T*r)*P+2^k*D
 *     where P=p1+p2*r+..+p8*r^7
 *
 * NOTE(review): the description above does not match this kernel line
 * for line.  As coded below:
 *     Mant   = getmant(x), Exp = getexp(x)
 *     k      = floor(Exp/3), j = Exp - 3*k
 *     DblRcp = rcp14(Mant) rounded by vrndscalepd $72
 *     R      = DblRcp*Mant - 1
 *     Th     = cbrt_tbl_H[index taken from the bits of DblRcp]
 *     Sh, Sl = etbl_H[j], etbl_L[j]
 *     Poly   = poly_coeff1 + poly_coeff2*R + ... + poly_coeff10*R^9
 *     Result = sign(x) | (2^k*Th) * (Sh + Sl + Sh*R*Poly)
 *
 */

/* Offsets for data table __svml_dcbrt_data_internal_avx512
 */
#define etbl_H				0
#define etbl_L				64
#define cbrt_tbl_H			128
#define BiasL				256
#define SZero				320
#define OneThird			384
#define Bias3				448
#define Three				512
#define One				576
#define poly_coeff10			640
#define poly_coeff9			704
#define poly_coeff8			768
#define poly_coeff7			832
#define poly_coeff6			896
#define poly_coeff5			960
#define poly_coeff4			1024
#define poly_coeff3			1088
#define poly_coeff2			1152
#define poly_coeff1			1216

#include <sysdep.h>

/*
 * _ZGVeN8v_cbrt_skx
 *   In:    %zmm0 = argument vector x
 *   Out:   %zmm0 = result vector
 *   Clobb: %zmm1-%zmm15 (every one of %zmm0-%zmm15 is written)
 *   Straight-line code: no branches, no stack or general-purpose
 *   register use; all constants are loaded RIP-relative from
 *   __svml_dcbrt_data_internal_avx512.
 *   Loads of independent constants are interleaved with the
 *   arithmetic; the instruction order is kept exactly as scheduled.
 */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_cbrt_skx)
	/* zmm14 = Mantissa of x (imm8 = 0: normalized to [1, 2)) */
	vgetmantpd $0, {sae}, %zmm0, %zmm14

	/* GetExp(x) */
	vgetexppd {sae}, %zmm0, %zmm7
	vmovups	BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8

	/* exponent/3 */
	vmovups	OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
	vmovups	Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10

	/* Reduced argument: R = DblRcp*Mantissa - 1 */
	vmovups	One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2

	/* exponent%3 (to be used as index) */
	vmovups	Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11

	/* DblRcp ~ 1/Mantissa */
	vrcp14pd %zmm14, %zmm13
	/* zmm12 = exponent + BiasL */
	vaddpd	{rn-sae}, %zmm8, %zmm7, %zmm12
	/* zmm6 = x & SZero: keep only the sign bit, OR-ed back at the end */
	vandpd	SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6

	/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */
	/* NOTE(review): imm8 = 72 = 0x48 encodes M = 4 in imm8[7:4], i.e.
	   rounding to a multiple of 2^-4; "3 bits" presumably counts the
	   explicit mantissa bits kept for values in [0.5, 1) - confirm.  */
	vrndscalepd $72, {sae}, %zmm13, %zmm15
	/* zmm10 = (exponent + BiasL)*OneThird - Bias3 */
	vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10

	/* polynomial */
	vmovups	poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0
	vmovups	poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
	/* zmm2 = R = DblRcp*Mantissa - One */
	vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2
	/* zmm5 = k: zmm10 rounded to an integer (imm8 = 9: round down,
	   no Precision exception) */
	vrndscalepd $9, {sae}, %zmm10, %zmm5

	/* Table lookup */
	vmovups	cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
	vmovups	poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
	vmovups	poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13
	/* zmm9 = poly_coeff7 + poly_coeff8*R */
	vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9
	/* zmm12 = (exponent + BiasL) - Three*k: permute index for etbl_* */
	vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12
	vmovups	poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
	vmovups	poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14

	/* Prepare table index */
	vpsrlq	$49, %zmm15, %zmm1

	/* Table lookup: 2^(exponent%3) */
	/* zmm4 = Sh (from etbl_H, at offset 0), zmm3 = Sl (from etbl_L) */
	vpermpd	__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4
	vpermpd	etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3
	/* zmm10 = Th: two-source lookup over both halves of cbrt_tbl_H
	   (first half already in zmm10, second half from memory) */
	vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10
	vmovups	poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1
	/* zmm11 = poly_coeff5 + poly_coeff6*R */
	vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11
	vmovups	poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12
	/* zmm15 = scaled_Th = Th * 2^k */
	vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15
	/* zmm1 = poly_coeff9 + poly_coeff10*R */
	vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1
	vmovups	poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5
	/* zmm14 = poly_coeff1 + poly_coeff2*R */
	vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14
	/* zmm0 = R^2 */
	vmulpd	{rn-sae}, %zmm2, %zmm2, %zmm0
	/* zmm13 = poly_coeff3 + poly_coeff4*R */
	vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13

	/* Sh*R */
	vmulpd	{rn-sae}, %zmm2, %zmm4, %zmm2
	/* Horner steps in R^2 over the coefficient pairs formed above;
	   after the fourth step zmm1 = Poly */
	vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1

	/* Sl + (Sh*R)*Poly */
	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2

	/*
	 * branch-free
	 * scaled_Th*(Sh+Sl+Sh*R*Poly)
	 */
	vaddpd	{rn-sae}, %zmm4, %zmm2, %zmm3
	vmulpd	{rn-sae}, %zmm15, %zmm3, %zmm4
	/* OR the saved sign bit of x into the result */
	vorpd	%zmm6, %zmm4, %zmm0
	ret

END(_ZGVeN8v_cbrt_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_dcbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 etbl_H[8][2];
	__declspec(align(64)) VUINT32 etbl_L[8][2];
	__declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
	__declspec(align(64)) VUINT32 BiasL[8][2];
	__declspec(align(64)) VUINT32 SZero[8][2];
	__declspec(align(64)) VUINT32 OneThird[8][2];
	__declspec(align(64)) VUINT32 Bias3[8][2];
	__declspec(align(64)) VUINT32 Three[8][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 poly_coeff10[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
} __svml_dcbrt_data_internal_avx512;
#endif
/* Constant table.  Each field starts on a 64-byte boundary at the byte
   offset given by the matching #define at the top of the file.  */
__svml_dcbrt_data_internal_avx512:
	/* etbl_H: high parts; entries 0..2 are 1, 2^(1/3), 2^(2/3),
	   entries 4..6 are their negatives */
	.quad	0x3ff0000000000000
	.quad	0x3ff428a2f98d728b
	.quad	0x3ff965fea53d6e3d
	.quad	0x0000000000000000
	.quad	0xbff0000000000000
	.quad	0xbff428a2f98d728b
	.quad	0xbff965fea53d6e3d
	.quad	0x0000000000000000
	/* etbl_L: low-order parts paired with etbl_H */
	.align	64
	.quad	0x0000000000000000
	.quad	0xbc7ddc22548ea41e
	.quad	0xbc9f53e999952f09
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x3c7ddc22548ea41e
	.quad	0x3c9f53e999952f09
	.quad	0x0000000000000000
	/* cbrt_tbl_H: 16 slots (two 64-byte halves), 9 non-zero entries */
	.align	64
	.quad	0x3ff428a2f98d728b
	.quad	0x3ff361f35ca116ff
	.quad	0x3ff2b6b5edf6b54a
	.quad	0x3ff220e6dd675180
	.quad	0x3ff19c3b38e975a8
	.quad	0x3ff12589c21fb842
	.quad	0x3ff0ba6ee5f9aad4
	.quad	0x3ff059123d3a9848
	.quad	0x3ff0000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	/* BiasL: 1.5*2^52 */
	.align	64
	.quad	0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
	/* SZero: sign-bit mask */
	.align	64
	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
	/* OneThird */
	.align	64
	.quad	0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
	/* Bias3: 2^51 */
	.align	64
	.quad	0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
	/* Three */
	.align	64
	.quad	0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
	/* One */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* poly_coeff10 */
	.align	64
	.quad	0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
	/* poly_coeff9 */
	.align	64
	.quad	0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
	/* poly_coeff8 */
	.align	64
	.quad	0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
	/* poly_coeff7 */
	.align	64
	.quad	0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
	/* poly_coeff6 */
	.align	64
	.quad	0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
	/* poly_coeff5 */
	.align	64
	.quad	0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
	/* poly_coeff4 */
	.align	64
	.quad	0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
	/* poly_coeff3 */
	.align	64
	.quad	0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
	/* poly_coeff2 */
	.align	64
	.quad	0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
	/* poly_coeff1 */
	.align	64
	.quad	0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
	.align	64
	.type	__svml_dcbrt_data_internal_avx512, @object
	.size	__svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512