/* Function atanhf vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */

/* Offsets for data table __svml_satanh_data_internal.  Ordered by use
   in the function.  On cold starts this might help the prefetcher.
   Possibly a better idea is to interleave start/end so that the
   prefetcher is less likely to detect a stream and pull irrelevant
   lines into cache.  */
#define SgnMask 0
#define sOne 32
#define sTopMask12 64
#define TinyRange 96
#define iBrkValue 128
#define iOffExpoMask 160
#define sPoly 192
#define sLn2 448
#define sHalf 480

#include <sysdep.h>
#define ATANHF_DATA(x) ((x) + __svml_satanh_data_internal)

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
	/* Strip off the sign, so treat X as positive until right at the end.  */
	vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
	vandps %ymm2, %ymm0, %ymm3
	/* Load constants including One = 1.  */
	vmovups ATANHF_DATA(sOne)(%rip), %ymm5
	vsubps %ymm3, %ymm5, %ymm1
	vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4

	vrcpps %ymm1, %ymm7
	vsubps %ymm1, %ymm5, %ymm9
	vandps %ymm4, %ymm7, %ymm6
	vsubps %ymm3, %ymm9, %ymm7

	/* No need to split sU when FMA is available.  */
	vfnmadd213ps %ymm5, %ymm6, %ymm1
	vmovaps %ymm0, %ymm8
	vfmadd213ps %ymm0, %ymm0, %ymm0
	vfnmadd231ps %ymm6, %ymm7, %ymm1

	/*
	 * Check whether |X| < 1, in which case we use the main function.
	 * Otherwise set the rangemask so that the callout will get used.
	 * Note that this will also use the callout for NaNs since not(NaN < 1).
	 */
	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15

	/*
	 * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
	 * the upper part UHi being <= 12 bits long.  Then we have
	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
	 */
	vaddps %ymm3, %ymm3, %ymm3

	/*
	 * Split V as well into upper 12 bits and lower part, so that we can get
	 * a preliminary quotient estimate without rounding error.
	 */
	vandps %ymm4, %ymm3, %ymm4
	vsubps %ymm4, %ymm3, %ymm7

	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
	vmulps %ymm4, %ymm6, %ymm4

	/* Compute D = E + E^2.  */
	vfmadd213ps %ymm1, %ymm1, %ymm1
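	/* In scalar terms the correction above is, roughly (a sketch only;
	   the vector code keeps UHi/ULo and VHi/VLo in 12-bit-wide pieces so
	   that the products below are exact, which plain pseudocode does not
	   capture):

	       u = 1 - x               -- U; ULo holds its rounding error
	       r = approx_rcp (u)      -- vrcpps estimate, top 12 bits kept
	       e = 1 - r * (1 - x)     -- E, the error of R as a reciprocal
	       d = e + e * e           -- D
	       q = r * (2 * x)         -- Q = QHi + QLo

	   Since r * (1 - x) = 1 - e, we have 1 / (1 - x) = r / (1 - e)
	   ~= r * (1 + e + e^2) = r * (1 + d), so q * (1 + d) ~= 2x / (1 - x),
	   which is what the log1p section below consumes.  approx_rcp is only
	   a stand-in for the vrcpps estimate, not a real helper.  */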
	/* Record the sign for eventual reincorporation.  */
	vandnps %ymm8, %ymm2, %ymm3

	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly.  */
	vorps %ymm3, %ymm0, %ymm13
	vmulps %ymm7, %ymm6, %ymm2

	/*
	 * Compute R * (VHi + VLo) * (1 + E + E^2)
	 * = R * (VHi + VLo) * (1 + D)
	 * = QHi + (QHi * D + QLo + QLo * D)
	 */

	/*
	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
	 * vaddps %ymm1, %ymm6, %ymm1` pair can be replaced with
	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
	 */
	vmulps %ymm1, %ymm4, %ymm6
	vfmadd213ps %ymm2, %ymm2, %ymm1
	vaddps %ymm1, %ymm6, %ymm1

	/*
	 * Now finally accumulate the high and low parts of the
	 * argument to log1p, H + L, with a final compensated summation.
	 */
	vaddps %ymm1, %ymm4, %ymm2

	/* Reduction: compute r, n.  */
	vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9

	/*
	 * Now we feed into the log1p code, using H in place of _VARG1 and
	 * later incorporating L into the reduced argument.
	 * Compute 1 + x as high, low parts.
	 */
	vmaxps %ymm2, %ymm5, %ymm0
	vminps %ymm2, %ymm5, %ymm6

	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2` above).  */
	vsubps %ymm2, %ymm4, %ymm2
	vaddps %ymm6, %ymm0, %ymm4
	vpsubd %ymm9, %ymm4, %ymm7
	vsubps %ymm4, %ymm0, %ymm4
	vaddps %ymm2, %ymm1, %ymm2
	vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1

	vandps %ymm1, %ymm7, %ymm0
	vaddps %ymm4, %ymm6, %ymm4
	vandnps %ymm7, %ymm1, %ymm6
	vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
	vpaddd %ymm9, %ymm0, %ymm0
	vaddps %ymm4, %ymm2, %ymm4
	vpsubd %ymm6, %ymm5, %ymm6

	/* Polynomial evaluation.  */
	vsubps %ymm5, %ymm0, %ymm2
	vfmadd231ps %ymm4, %ymm6, %ymm2
	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1

	vmulps %ymm1, %ymm2, %ymm1
	vfmadd213ps %ymm2, %ymm2, %ymm1

	/* Final reconstruction.  */
	vpsrad $23, %ymm7, %ymm6
	vcvtdq2ps %ymm6, %ymm2
	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2

	/* Finally, halve the result and reincorporate the sign.  */
	vxorps ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
	vmulps %ymm2, %ymm3, %ymm2
	vmovmskps %ymm14, %edx
	testl %edx, %edx

	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
	/* Go to special inputs processing branch.  */
	jne L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rdx r12 r13 r14 r15 ymm0
	/* No registers to restore on fast path.  */
	ret


	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by an atanhf call.  Optimize for code size
	   more than for speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
	/* Use r13 to save/restore the stack.  This allows us to use rbp as
	   a callee-save register, saving code size.  */
	pushq %r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need callee-saved registers to preserve state across atanhf
	   calls.  */
	pushq %rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq %rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq %rsp, %r13
	cfi_def_cfa_register(r13)
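	/* Scratch frame used by the callout once %rsp is aligned below
	   (a sketch of what the following code assumes):
	      0(%rsp)  .. 31(%rsp): the 8 fast-path results, patched lane by
	                            lane by the scalar loop.
	      32(%rsp) .. 63(%rsp): the 8 original inputs, read by the loop.
	   %r13 keeps the pre-alignment %rsp so it can be restored on exit.  */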
	/* Align the stack and make room for 2x ymm vectors.  */
	andq $-32, %rsp
	addq $-64, %rsp

	/* Save all already computed results.  */
	vmovups %ymm0, (%rsp)
	/* Save original input (ymm8 unchanged up to this point).  */
	vmovups %ymm8, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be handled
	   by an atanhf call.  */
	movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value; it is preserved across
	   calls to atanhf.  We technically don't need a callee-save register
	   here as the offset to rsp is always [0, 28], so we could restore rsp
	   by realigning to 64.  Essentially the tradeoff is 1 extra
	   save/restore vs 2 extra instructions in the loop.  Realigning also
	   costs more code size.  */
	xorl %ebp, %ebp
	tzcntl %ebx, %ebp

	/* Scalar math function call to process the special input.  */
	vmovss 32(%rsp, %rbp, 4), %xmm0
	call atanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause on
	   return.  `lfence` avoids the SF fault but at greater cost as it
	   serializes stack/callee-save restoration.  */
	vmovss %xmm0, (%rsp, %rbp, 4)

	blsrl %ebx, %ebx
	jnz L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15


	/* All results have been written to (%rsp).  */
	vmovups (%rsp), %ymm0
	/* Restore rsp.  */
	movq %r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee-save registers.  */
	popq %rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq %rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq %r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_atanhf_avx2)
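/* The sPoly entries below are the eight coefficients used by the log1p
   part of the computation above.  In terms of the P0..P7 labels in the
   table comments, the code evaluates (a sketch of the reconstruction,
   with r the reduced argument and n the extracted exponent):

       poly = ((((((P7 * r + P6) * r + P5) * r + P4) * r + P3) * r + P2)
               * r + P1) * r + P0
       log((1 + x) / (1 - x)) ~= n * ln2 + r + poly * r * r

   which is then multiplied by +-0.5 (sHalf with the recorded sign) to
   give atanh(x).  */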
	.section .rodata, "a"
	.align 32
#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct{
	__declspec(align(32)) VUINT32 SgnMask[8][1];
	__declspec(align(32)) VUINT32 sOne[8][1];
	__declspec(align(32)) VUINT32 sTopMask12[8][1];
	__declspec(align(32)) VUINT32 TinyRange[8][1];
	__declspec(align(32)) VUINT32 iBrkValue[8][1];
	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
	__declspec(align(32)) VUINT32 sPoly[8][8][1];
	__declspec(align(32)) VUINT32 sLn2[8][1];
	__declspec(align(32)) VUINT32 sHalf[8][1];
} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
	/* SgnMask */
	.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sOne = SP 1.0 */
	.align 32
	.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sTopMask12 */
	.align 32
	.long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	.long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	/* TinyRange */
	.align 32
	.long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	.long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	/* iBrkValue = SP 2/3 */
	.align 32
	.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	.long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align 32
	.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sPoly[] = SP polynomial */
	.align 32
	.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
	.long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
	.long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
	.long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
	.long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
	.long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
	.long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
	.long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	/* sLn2 = SP ln(2) */
	.align 32
	.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* sHalf */
	.align 32
	.long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	.long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	.align 32
	.type __svml_satanh_data_internal, @object
	.size __svml_satanh_data_internal, .-__svml_satanh_data_internal