/* Function atanh vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *   using a small lookup table that maps to AVX-512 permute instructions
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */

/* Offsets for data table __svml_datanh_data_internal_avx512
 */
#define Log_tbl_H 0
#define Log_tbl_L 128
#define One 256
#define AbsMask 320
#define AddB5 384
#define RcpBitMask 448
#define poly_coeff8 512
#define poly_coeff7 576
#define poly_coeff6 640
#define poly_coeff5 704
#define poly_coeff4 768
#define poly_coeff3 832
#define poly_coeff2 896
#define poly_coeff1 960
#define poly_coeff0 1024
#define Half 1088
#define L2H 1152
#define L2L 1216

#include <sysdep.h>

        .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_atanh_skx)
        pushq %rbp
        cfi_def_cfa_offset(16)
        movq %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq $-64, %rsp
        subq $192, %rsp
        vmovups One+__svml_datanh_data_internal_avx512(%rip), %zmm15

        /* round reciprocals to 1+4b mantissas */
        vmovups AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
        vmovups RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
        vmovaps %zmm0, %zmm2
        vandpd AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13

        /* 1+y */
        vaddpd {rn-sae}, %zmm15, %zmm13, %zmm0

        /* 1-y */
        vsubpd {rn-sae}, %zmm13, %zmm15, %zmm4
        vxorpd %zmm13, %zmm2, %zmm1

        /* Yp_high */
        vsubpd {rn-sae}, %zmm15, %zmm0, %zmm7

        /* -Ym_high */
        vsubpd {rn-sae}, %zmm15, %zmm4, %zmm12

        /* RcpP ~ 1/Yp */
        vrcp14pd %zmm0, %zmm3

        /* RcpM ~ 1/Ym */
        vrcp14pd %zmm4, %zmm5
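
        /* For reference, a scalar C sketch of the reduction and log()
         * evaluation carried out around this point on Yp = 1+|x| and
         * Ym = 1-|x| (illustrative only, not part of the build;
         * log_model and atanh_model are made-up names, and inputs with
         * |x| >= 1 are ignored here -- the kernel routes those lanes to
         * the scalar fallback further down):
         *
         *   #include <math.h>
         *
         *   static double log_model (double y)
         *   {
         *     int e;
         *     double m = frexp (1.0 / y, &e);          // 1/y = m * 2^e, m in [0.5, 1)
         *     int j = (int) nearbyint (m * 32.0) - 16; // table index, 0..16
         *     double r = ldexp ((16 + j) / 32.0, e);   // reciprocal rounded to a 1+4-bit mantissa
         *     double R = r * y - 1.0;                  // reduced argument, |R| < ~2^-5
         *     // log (y) = log1p (R) - log (r); the kernel splits -log (r) into an
         *     // exponent term scaled by L2H/L2L plus a Log_tbl_H/Log_tbl_L entry,
         *     // and evaluates log1p (R) with the poly_coeff8..poly_coeff0 polynomial.
         *     return (1 - e) * M_LN2 - log1p (j / 16.0) + log1p (R);
         *   }
         *
         *   double atanh_model (double x)
         *   {
         *     double ax = fabs (x);
         *     return copysign (0.5, x) * (log_model (1.0 + ax) - log_model (1.0 - ax));
         *   }
         */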

        /* input outside (-1, 1) ? */
        vcmppd $21, {sae}, %zmm15, %zmm13, %k0
        vpaddq %zmm6, %zmm3, %zmm11
        vpaddq %zmm6, %zmm5, %zmm10

        /* Yp_low */
        vsubpd {rn-sae}, %zmm7, %zmm13, %zmm8
        vandpd %zmm9, %zmm11, %zmm14
        vandpd %zmm9, %zmm10, %zmm3

        /* Ym_low */
        vaddpd {rn-sae}, %zmm12, %zmm13, %zmm12

        /* Reduced argument: Rp = (RcpP*Yp - 1) + RcpP*Yp_low */
        vfmsub213pd {rn-sae}, %zmm15, %zmm14, %zmm0

        /* Reduced argument: Rm = (RcpM*Ym - 1) + RcpM*Ym_low */
        vfmsub231pd {rn-sae}, %zmm3, %zmm4, %zmm15

        /* exponents */
        vgetexppd {sae}, %zmm14, %zmm5
        vgetexppd {sae}, %zmm3, %zmm4

        /* Table lookups */
        vmovups __svml_datanh_data_internal_avx512(%rip), %zmm9
        vmovups Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
        vmovups Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm0
        vfnmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm15

        /* Prepare table index */
        vpsrlq $48, %zmm14, %zmm11
        vpsrlq $48, %zmm3, %zmm8
        vmovups Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14

        /* polynomials */
        vmovups poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3

        /* Km-Kp */
        vsubpd {rn-sae}, %zmm5, %zmm4, %zmm5
        vmovups poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
        kmovw %k0, %edx
        vmovaps %zmm11, %zmm10
        vmovaps %zmm4, %zmm6
        vpermi2pd %zmm13, %zmm9, %zmm10
        vpermi2pd %zmm14, %zmm7, %zmm11
        vpermt2pd %zmm13, %zmm8, %zmm9
        vpermt2pd %zmm14, %zmm8, %zmm7
        vmovups poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
        vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm6
        vfmadd231pd {rn-sae}, %zmm15, %zmm3, %zmm4
        vmovups poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
        vmovups poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
        vmovups poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
        vsubpd {rn-sae}, %zmm11, %zmm7, %zmm12

        /* table values */
        vsubpd {rn-sae}, %zmm10, %zmm9, %zmm3
        vmovups poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vmovups poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9

        /* K*L2H + Th */
        vmovups L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10

        /* K*L2L + Tl */
        vmovups L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
        vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
        vmovups poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vfmadd231pd {rn-sae}, %zmm5, %zmm10, %zmm3
        vfmadd213pd {rn-sae}, %zmm12, %zmm11, %zmm5
        vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm9, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm13, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4

        /* (K*L2L + Tl) + Rp*PolyP */
        vfmadd213pd {rn-sae}, %zmm5, %zmm0, %zmm6
        vorpd Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0

        /* (K*L2L + Tl) + Rp*PolyP - Rm*PolyM */
        vfnmadd213pd {rn-sae}, %zmm6, %zmm15, %zmm4
        vaddpd {rn-sae}, %zmm4, %zmm3, %zmm1
        vmulpd {rn-sae}, %zmm0, %zmm1, %zmm0
        testl %edx, %edx
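
        /* %edx now holds the vcmppd mask: bit i is set when lane i has
         * |x| >= 1 or x is NaN.  The special-value path below amounts to
         * the following scalar sketch (illustrative only; the helper name
         * and the in/out arrays are made up -- the kernel keeps them in
         * the 64(%rsp) and 128(%rsp) spill slots):
         *
         *   #include <math.h>
         *
         *   static void fixup_special_lanes (const double in[8], double out[8], int mask)
         *   {
         *     for (int i = 0; i < 8; i++)     // one bit per vector lane
         *       if (mask & (1 << i))
         *         out[i] = atanh (in[i]);     // scalar libm handles the special value
         *   }
         */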

        /* Go to special inputs processing branch */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm2

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movq %rbp, %rsp
        popq %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)

        /* Branch to process
         * special inputs
         */

L(SPECIAL_VALUES_BRANCH):
        vmovups %zmm2, 64(%rsp)
        vmovups %zmm0, 128(%rsp)
        # LOE rbx r12 r13 r14 r15 edx zmm0

        xorl %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx

        vzeroupper
        movq %r12, 16(%rsp)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        movl %eax, %r12d
        movq %r13, 8(%rsp)
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        movl %edx, %r13d
        movq %r14, (%rsp)
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl %r12d, %r13d

        /* Call scalar math function */
        jc L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl %r12d
        cmpl $8, %r12d

        /* Check bits in range mask */
        jl L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        movq 16(%rsp), %r12
        cfi_restore(12)
        movq 8(%rsp), %r13
        cfi_restore(13)
        movq (%rsp), %r14
        cfi_restore(14)
        vmovups 128(%rsp), %zmm0

        /* Go to exit */
        jmp L(EXIT)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 zmm0

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl %r12d, %r14d
        vmovsd 64(%rsp, %r14, 8), %xmm0
        call atanh@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        vmovsd %xmm0, 128(%rsp, %r14, 8)

        /* Process special inputs in loop */
        jmp L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVeN8v_atanh_skx)

        .section .rodata, "a"
        .align 64

#ifdef __svml_datanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl_H[16][2];
        __declspec(align(64)) VUINT32 Log_tbl_L[16][2];
        __declspec(align(64)) VUINT32 One[8][2];
        __declspec(align(64)) VUINT32 AbsMask[8][2];
        __declspec(align(64)) VUINT32 AddB5[8][2];
        __declspec(align(64)) VUINT32 RcpBitMask[8][2];
        __declspec(align(64)) VUINT32 poly_coeff8[8][2];
        __declspec(align(64)) VUINT32 poly_coeff7[8][2];
        __declspec(align(64)) VUINT32 poly_coeff6[8][2];
        __declspec(align(64)) VUINT32 poly_coeff5[8][2];
        __declspec(align(64)) VUINT32 poly_coeff4[8][2];
        __declspec(align(64)) VUINT32 poly_coeff3[8][2];
        __declspec(align(64)) VUINT32 poly_coeff2[8][2];
        __declspec(align(64)) VUINT32 poly_coeff1[8][2];
        __declspec(align(64)) VUINT32 poly_coeff0[8][2];
        __declspec(align(64)) VUINT32 Half[8][2];
        __declspec(align(64)) VUINT32 L2H[8][2];
        __declspec(align(64)) VUINT32 L2L[8][2];
} __svml_datanh_data_internal_avx512;
#endif
__svml_datanh_data_internal_avx512:
        /* Log_tbl_H */
        .quad 0x0000000000000000
        .quad 0x3faf0a30c0100000
        .quad 0x3fbe27076e2a0000
        .quad 0x3fc5ff3070a80000
        .quad 0x3fcc8ff7c79b0000
        .quad 0x3fd1675cabab8000
        .quad 0x3fd4618bc21c8000
        .quad 0x3fd739d7f6bc0000
        .quad 0x3fd9f323ecbf8000
        .quad 0x3fdc8ff7c79a8000
        .quad 0x3fdf128f5faf0000
        .quad 0x3fe0be72e4254000
        .quad 0x3fe1e85f5e704000
        .quad 0x3fe307d7334f0000
        .quad 0x3fe41d8fe8468000
        .quad 0x3fe52a2d265bc000
        /* Log_tbl_L */
        .align 64
        .quad 0x0000000000000000
        .quad 0x3d662a6617cc9717
        .quad 0x3d6e5cbd3d50fffc
        .quad 0xbd6b0b0de3077d7e
        .quad 0xbd697794f689f843
        .quad 0x3d630701ce63eab9
        .quad 0xbd609ec17a426426
        .quad 0xbd67fcb18ed9d603
        .quad 0x3d584bf2b68d766f
        .quad 0x3d5a21ac25d81ef3
        .quad 0x3d3bb2cd720ec44c
        .quad 0xbd657d49676844cc
        .quad 0x3d1a07bd8b34be7c
        .quad 0x3d60be1fb590a1f5
        .quad 0xbd5aa33736867a17
        .quad 0x3d46abb9df22bc57
        /* One */
        .align 64
        .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /* AbsMask */
        .align 64
        .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
        /* AddB5 */
        .align 64
        .quad 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
        /* RcpBitMask */
        .align 64
        .quad 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
        /* poly_coeff8 */
        .align 64
        .quad 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
        /* poly_coeff7 */
        .align 64
        .quad 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
        /* poly_coeff6 */
        .align 64
        .quad 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
        /* poly_coeff5 */
        .align 64
        .quad 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
        /* poly_coeff4 */
        .align 64
        .quad 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
        /* poly_coeff3 */
        .align 64
        .quad 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
        /* poly_coeff2 */
        .align 64
        .quad 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
        /* poly_coeff1 */
        .align 64
        .quad 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
        /* poly_coeff0 */
        .align 64
        .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /* Half */
        .align 64
        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
        /* L2H = log(2)_high */
        .align 64
        .quad 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
        /* L2L = log(2)_low */
        .align 64
        .quad 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
        .align 64
        .type __svml_datanh_data_internal_avx512, @object
        .size __svml_datanh_data_internal_avx512, .-__svml_datanh_data_internal_avx512
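
        /* The constants above can be cross-checked with a small host
         * program (illustrative only, not part of the build; as_double is
         * a made-up helper).  L2H + L2L should reproduce log (2), and a
         * Log_tbl_H/Log_tbl_L pair should reproduce log (1 + j/16) -- here
         * j = 8, i.e. log (1.5):
         *
         *   #include <math.h>
         *   #include <stdint.h>
         *   #include <stdio.h>
         *   #include <string.h>
         *
         *   static double as_double (uint64_t bits)
         *   {
         *     double d;
         *     memcpy (&d, &bits, sizeof d);
         *     return d;
         *   }
         *
         *   int main (void)
         *   {
         *     printf ("%a\n", as_double (0x3fe62E42FEFA0000ULL)
         *                     + as_double (0x3d7cf79abc9e0000ULL) - M_LN2);
         *     printf ("%a\n", as_double (0x3fd9f323ecbf8000ULL)
         *                     + as_double (0x3d584bf2b68d766fULL) - log1p (0.5));
         *     return 0;
         *   }
         *
         * Both differences should print as zero or within a few ulps of it.
         */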