/* Function exp10 vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *  Typical exp10() implementation, except that:
 *   - tables are small (16 elements), allowing for fast gathers
 *   - all arguments processed in the main path
 *   - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
 *   - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
 *   - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
 *   - SAE used to avoid spurious flag settings
 *
 */

/* Byte offsets for data table __svml_dexp10_data_internal_avx512.
   Each field below is one 64-byte (zmm-sized) row, except Exp_tbl_H
   which is two rows (16 doubles).  */
#define Exp_tbl_H			0
#define L2E				128
#define Shifter				192
#define L2H				256
#define L2L				320
#define EMask				384
#define poly_coeff6			448
#define poly_coeff5			512
#define poly_coeff4			576
#define poly_coeff3			640
#define poly_coeff2			704
#define poly_coeff1			768
#define AbsMask				832
#define Threshold			896

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
/* SysV AMD64 vector ABI: 8 doubles in on zmm0, 8 results out in zmm0.
   zmm1 keeps a copy of the input so the scalar fallback can replay
   individual lanes.  */
ENTRY(_ZGVeN8v_exp10_skx)
	/* Frame-pointer prologue; rsp is then aligned down to 64 and
	   192 bytes are reserved: 64(%rsp) input spill, 128(%rsp)
	   result spill, 0..16(%rsp) saves for r14/r13/r12.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	L2E+__svml_dexp10_data_internal_avx512(%rip), %zmm4
	vmovups	Shifter+__svml_dexp10_data_internal_avx512(%rip), %zmm2
	vmovups	L2H+__svml_dexp10_data_internal_avx512(%rip), %zmm5
	vmovups	L2L+__svml_dexp10_data_internal_avx512(%rip), %zmm3

	/* polynomial */
	vmovups	poly_coeff6+__svml_dexp10_data_internal_avx512(%rip), %zmm6
	vmovups	poly_coeff4+__svml_dexp10_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff3+__svml_dexp10_data_internal_avx512(%rip), %zmm9
	vmovups	poly_coeff2+__svml_dexp10_data_internal_avx512(%rip), %zmm8
	vmovups	poly_coeff1+__svml_dexp10_data_internal_avx512(%rip), %zmm11
	vmovups	Threshold+__svml_dexp10_data_internal_avx512(%rip), %zmm14
	vmovaps	%zmm0, %zmm1

	/* 2^(52-4)*1.5 + x * log2(e).  The Shifter constant forces the
	   rounded x*log2(e) into the low bits of zmm4 (4 fractional
	   bits kept for the 16-entry table index); RZ rounding avoids
	   overflow to +/-Inf for huge |x|.  */
	vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm4
	vandpd	AbsMask+__svml_dexp10_data_internal_avx512(%rip), %zmm1, %zmm13

	/* Z0 ~ x*log2(e), rounded down to 4 fractional bits */
	vsubpd	{rn-sae}, %zmm2, %zmm4, %zmm0

	/* Table lookup: Th.  zmm4's low bits select among the 16
	   Exp_tbl_H entries via vpermt2pd below.  */
	vmovups	__svml_dexp10_data_internal_avx512(%rip), %zmm2
	/* |x| >= Threshold (predicate 29 = GE, SAE suppresses flags);
	   k0 marks lanes needing the scalar special-case path.  */
	vcmppd	$29, {sae}, %zmm14, %zmm13, %k0

	/* R = x - Z0*log(2), in two pieces (L2H high / L2L low) for
	   extra precision.  */
	vfnmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm5
	vpermt2pd Exp_tbl_H+64+__svml_dexp10_data_internal_avx512(%rip), %zmm4, %zmm2
	kmovw	%k0, %edx
	vfnmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm5
	vmovups	poly_coeff5+__svml_dexp10_data_internal_avx512(%rip), %zmm3

	/* ensure |R|<2 even for special cases */
	vandpd	EMask+__svml_dexp10_data_internal_avx512(%rip), %zmm5, %zmm12
	/* Estrin-style polynomial evaluation: combine pairs with R,
	   then fold with R^2 (zmm10); zmm15 = Th*R scales the result
	   onto the table value.  */
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm10
	vmulpd	{rn-sae}, %zmm12, %zmm2, %zmm15
	vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm3
	vfmadd231pd {rn-sae}, %zmm12, %zmm7, %zmm9
	vfmadd231pd {rn-sae}, %zmm12, %zmm8, %zmm11
	vfmadd213pd {rn-sae}, %zmm9, %zmm10, %zmm3
	vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm3
	vfmadd213pd {rn-sae}, %zmm2, %zmm15, %zmm3
	/* result = poly * 2^Z0; VSCALEF handles overflow/underflow
	   saturation branch-free.  */
	vscalefpd {rn-sae}, %zmm0, %zmm3, %zmm0
	testl	%edx, %edx

	/* Go to special inputs
	   processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the original input (zmm1) and the fast-path results
	   (zmm0); flagged lanes are overwritten one by one below.  */
	vmovups	%zmm1, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear upper zmm state before calling scalar libm code.  */
	vzeroupper
	/* Save callee-saved r12/r13/r14 in the aligned frame; the
	   CFA expressions below describe these aligned-stack slots
	   to the unwinder.  r12 = lane index, r13 = special-lane
	   bitmask (from edx).  */
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12 of the special-lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All 8 lanes handled: restore saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* Load lane r12 of the spilled input, call scalar exp10, and
	   store the result over the corresponding output lane.  */
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	exp10@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_exp10_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_dexp10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
	__declspec(align(64)) VUINT32 L2E[8][2];
	__declspec(align(64)) VUINT32 Shifter[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
	__declspec(align(64)) VUINT32 EMask[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
} __svml_dexp10_data_internal_avx512;
#endif
__svml_dexp10_data_internal_avx512:
	/* Exp_tbl_H: 2^(j/16) for j = 0..15 (IEEE-754 double bits).  */
	.quad	0x3ff0000000000000
	.quad	0x3ff0b5586cf9890f
	.quad	0x3ff172b83c7d517b
	.quad	0x3ff2387a6e756238
	.quad	0x3ff306fe0a31b715
	.quad	0x3ff3dea64c123422
	.quad	0x3ff4bfdad5362a27
	.quad	0x3ff5ab07dd485429
	.quad	0x3ff6a09e667f3bcd
	.quad	0x3ff7a11473eb0187
	.quad	0x3ff8ace5422aa0db
	.quad	0x3ff9c49182a3f090
	.quad	0x3ffae89f995ad3ad
	.quad	0x3ffc199bdd85529c
	.quad	0x3ffd5818dcfba487
	.quad	0x3ffea4afa2a490da
	/* log2(e) -- NOTE(review): for exp10 this row presumably holds
	   log2(10) (the value is ~3.32); the comment name follows the
	   generic exp template.  */
	.align	64
	.quad	0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371
	/* Shifter=2^(52-4)*1.5 */
	.align	64
	.quad	0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
	/* L2H = log(2)_high */
	.align	64
	.quad	0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
	/* L2L = log(2)_low */
	.align	64
	.quad	0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21
	/* EMask: clears the sign-exponent top bit pattern so the
	   reduced argument satisfies |R|<2 even for special inputs.  */
	.align	64
	.quad	0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
	/* poly_coeff6 */
	.align	64
	.quad	0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020
	/* poly_coeff5 */
	.align	64
	.quad	0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424
	/* poly_coeff4 */
	.align	64
	.quad	0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d
	/* poly_coeff3 */
	.align	64
	.quad	0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8
	/* poly_coeff2 */
	.align	64
	.quad	0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25
	/* poly_coeff1 */
	.align	64
	.quad	0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2
	/* AbsMask: all bits but the sign, for |x|.  */
	.align	64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Threshold: |x| at or above this goes to the scalar
	   special-case path (value ~307.65, near the double exp10
	   overflow bound).  */
	.align	64
	.quad	0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41
	.align	64
	.type	__svml_dexp10_data_internal_avx512, @object
	.size	__svml_dexp10_data_internal_avx512, .-__svml_dexp10_data_internal_avx512