/* Function atanhf vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */

/* Offsets for data table __svml_satanh_data_internal.  Ordered
   by use in the function.  On cold-starts this might help the
   prefetcher.  Possibly a better idea is to interleave start/end so
   that the prefetcher is less likely to detect a stream and pull
   irrelevant lines into cache.  */
#define sOne         0
#define SgnMask      16
#define sTopMask12   32
#define iBrkValue    48
#define iOffExpoMask 64
#define sPoly        80
#define sLn2         208
#define TinyRange    224

#include <sysdep.h>
#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)

        .section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
        movaps   %xmm0, %xmm5

        /* Load constants including One = 1 */
        movups   ATANHF_DATA(sOne)(%rip), %xmm4
        movaps   %xmm5, %xmm3

        /* Strip off the sign, so treat X as positive until right at the end */
        movups   ATANHF_DATA(SgnMask)(%rip), %xmm1
        movaps   %xmm4, %xmm2
        andps    %xmm1, %xmm0
        movaps   %xmm4, %xmm10
        movups   ATANHF_DATA(sTopMask12)(%rip), %xmm11
        movaps   %xmm4, %xmm14
        movaps   %xmm11, %xmm9


        /*
         * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
         * the upper part UHi being <= 12 bits long.  Then we have
         * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
         */
        movaps   %xmm0, %xmm6
        mulps    %xmm5, %xmm3
        subps    %xmm0, %xmm2
        addps    %xmm0, %xmm6
        subps    %xmm2, %xmm10
        addps    %xmm5, %xmm3
        subps    %xmm0, %xmm10
        andps    %xmm2, %xmm9


        /*
         * Check whether |X| < 1, in which case we use the main function.
         * Otherwise set the rangemask so that the callout will get used.
         * Note that this will also use the callout for NaNs since not(NaN < 1).
         */
        rcpps    %xmm9, %xmm7
        subps    %xmm9, %xmm2
        andps    %xmm11, %xmm7


        /*
         * Split V as well into upper 12 bits and lower part, so that we can get
         * a preliminary quotient estimate without rounding error.
         */
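        /* Illustrative only, not part of the build: a scalar C model of the
           split-and-correct division used below.  All names here (top12,
           atanh_quotient_parts) are hypothetical; top12 is the scalar
           analogue of masking with sTopMask12, keeping at most 12
           significand bits.

               #include <stdint.h>
               #include <string.h>

               static float
               top12 (float x)
               {
                 uint32_t u;
                 memcpy (&u, &x, sizeof (u));
                 u &= 0xfffff000u;
                 memcpy (&x, &u, sizeof (x));
                 return x;
               }

               // Assumes 0 <= x < 1 (the fast path).
               static float
               atanh_quotient_parts (float x, float *lo)
               {
                 float v = x + x;                          // V = 2 * X
                 float u = 1.0f - x;                       // U, one rounding
                 float uhi = top12 (u);                    // UHi
                 float ulo = (u - uhi) + ((1.0f - u) - x); // ULo plus rounding error of U
                 float vhi = top12 (v), vlo = v - vhi;     // VHi + VLo = V exactly
                 float r = top12 (1.0f / uhi);             // R ~= 1/UHi (rcpps + mask above)
                 float e = (1.0f - r * uhi) - r * ulo;     // E = 1 - R * (UHi + ULo)
                 float d = e + e * e;                      // D = E + E^2
                 float qhi = r * vhi, qlo = r * vlo;       // QHi + QLo = R * V
                 *lo = qhi * d + qlo + qlo * d;            // low-order correction
                 return qhi;                               // V / U ~= QHi + *lo
               }

           The vector code keeps QHi and the correction term in separate
           registers so they can be folded into the compensated log1p
           argument further down.  */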
        andps    %xmm6, %xmm11
        mulps    %xmm7, %xmm9
        addps    %xmm2, %xmm10
        subps    %xmm11, %xmm6

        /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
        mulps    %xmm7, %xmm11
        mulps    %xmm7, %xmm10
        subps    %xmm9, %xmm14
        mulps    %xmm6, %xmm7
        subps    %xmm10, %xmm14

        /* Compute D = E + E^2 */
        movaps   %xmm14, %xmm13
        movaps   %xmm4, %xmm8
        mulps    %xmm14, %xmm13

        /* reduction: compute r,n */
        movdqu   ATANHF_DATA(iBrkValue)(%rip), %xmm9
        addps    %xmm13, %xmm14

        /*
         * Compute R * (VHi + VLo) * (1 + E + E^2)
         * = R * (VHi + VLo) * (1 + D)
         * = QHi + (QHi * D + QLo + QLo * D)
         */
        movaps   %xmm14, %xmm2
        mulps    %xmm7, %xmm14
        mulps    %xmm11, %xmm2
        addps    %xmm14, %xmm7
        movdqu   ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
        movaps   %xmm4, %xmm14

        /* Record the sign for eventual reincorporation. */
        addps    %xmm7, %xmm2


        /*
         * Now finally accumulate the high and low parts of the
         * argument to log1p, H + L, with a final compensated summation.
         */
        movaps   %xmm2, %xmm6
        andnps   %xmm5, %xmm1
        movaps   %xmm4, %xmm7
        /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
        addps    %xmm11, %xmm6
        maxps    %xmm6, %xmm7
        minps    %xmm6, %xmm8
        subps    %xmm6, %xmm11
        movaps   %xmm7, %xmm10
        addps    %xmm8, %xmm10
        addps    %xmm11, %xmm2
        subps    %xmm10, %xmm7
        psubd    %xmm9, %xmm10
        addps    %xmm8, %xmm7
        pand     %xmm10, %xmm12
        psrad    $23, %xmm10
        cvtdq2ps %xmm10, %xmm13
        addps    %xmm7, %xmm2

        /* final reconstruction */
        pslld    $23, %xmm10
        paddd    %xmm9, %xmm12
        psubd    %xmm10, %xmm14

        /* polynomial evaluation */
        subps    %xmm4, %xmm12
        mulps    %xmm14, %xmm2
        movups   ATANHF_DATA(sPoly+0)(%rip), %xmm7
        addps    %xmm12, %xmm2
        mulps    %xmm2, %xmm7


        /* Finally, halve the result and reincorporate the sign */
        addps    ATANHF_DATA(sPoly+16)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        addps    ATANHF_DATA(sPoly+32)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        addps    ATANHF_DATA(sPoly+48)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        addps    ATANHF_DATA(sPoly+64)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        addps    ATANHF_DATA(sPoly+80)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        addps    ATANHF_DATA(sPoly+96)(%rip), %xmm7
        mulps    %xmm2, %xmm7
        movaps   ATANHF_DATA(sPoly+112)(%rip), %xmm6
        addps    %xmm6, %xmm7
        mulps    %xmm2, %xmm7
        mulps    %xmm2, %xmm7
        mulps    ATANHF_DATA(sLn2)(%rip), %xmm13
        /* We can build `sHalf` with `sPoly & sOne`. */
        andps    %xmm4, %xmm6
        orps     %xmm1, %xmm3
        xorps    %xmm6, %xmm1

        addps    %xmm2, %xmm7
        addps    %xmm13, %xmm7
        mulps    %xmm7, %xmm1

        /* Finish check of NaNs. */
        cmpleps  %xmm0, %xmm4
        movmskps %xmm4, %edx
        cmpltps  ATANHF_DATA(TinyRange)(%rip), %xmm0

        andps    %xmm0, %xmm3
        andnps   %xmm1, %xmm0
        orps     %xmm3, %xmm0

        testl    %edx, %edx
        /* Go to special inputs processing branch. */
        jne      L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 xmm0
        /* No registers to restore on fast path. */
        ret


        /* Cold case.  edx has 1s where there was a special value that
           needs to be handled by an atanhf call.  Optimize for code size
           more so than speed here. */
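        /* Illustrative only, not part of the build: the branch below is the
           vector equivalent of this scalar fixup loop.  The helper name is
           hypothetical; `mask' is the movmskps result kept in %edx, `arg'
           the original inputs saved from %xmm5, and `res' the fast-path
           results saved from %xmm0.

               #include <math.h>

               static void
               fixup_special_lanes (float res[4], const float arg[4],
                                    unsigned int mask)
               {
                 while (mask != 0)
                   {
                     int i = __builtin_ctz (mask); // bsf: next special lane
                     res[i] = atanhf (arg[i]);     // scalar call handles |x| >= 1 and NaN
                     mask &= mask - 1;             // clear the bit just processed
                   }
               }

           The `leal -1'/`andl' pair in the loop below implements
           mask &= mask - 1.  */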
L(SPECIAL_VALUES_BRANCH):
        # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
        /* Stack coming in 16-byte aligned.  Set 8-byte misaligned so on
           call entry will be 16-byte aligned. */
        subq     $56, %rsp
        cfi_def_cfa_offset(64)
        movups   %xmm0, 24(%rsp)
        movups   %xmm5, 40(%rsp)

        /* Use rbx/rbp for callee save registers as they get short
           encoding for many instructions (as compared with r12/r13). */
        movq     %rbx, (%rsp)
        cfi_offset(rbx, -64)
        movq     %rbp, 8(%rsp)
        cfi_offset(rbp, -56)
        /* edx has 1s where there was a special value that needs to be handled
           by an atanhf call. */
        movl     %edx, %ebx
L(SPECIAL_VALUES_LOOP):
        # LOE rbx rbp r12 r13 r14 r15
        /* Use rbp as index for special value that is saved across calls to
           atanhf.  We technically don't need a callee save register here as offset
           to rsp is always [0, 12] so we can restore rsp by realigning to 64.
           Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
           in the loop. */
        xorl     %ebp, %ebp
        bsfl     %ebx, %ebp

        /* Scalar math function call to process special input. */
        movss    40(%rsp, %rbp, 4), %xmm0
        call     atanhf@PLT
        /* No good way to avoid the store-forwarding fault this will cause on
           return.  `lfence` avoids the SF fault but at greater cost as it
           serializes stack/callee save restoration. */
        movss    %xmm0, 24(%rsp, %rbp, 4)

        leal     -1(%rbx), %eax
        andl     %eax, %ebx
        jnz      L(SPECIAL_VALUES_LOOP)
        # LOE r12 r13 r14 r15
        /* All results have been written to 24(%rsp). */
        movups   24(%rsp), %xmm0
        movq     (%rsp), %rbx
        cfi_restore(rbx)
        movq     8(%rsp), %rbp
        cfi_restore(rbp)
        addq     $56, %rsp
        cfi_def_cfa_offset(8)
        ret
END(_ZGVbN4v_atanhf_sse4)

        .section .rodata, "a"
        .align   16

#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct{
        __declspec(align(16)) VUINT32 sOne[4][1];
        __declspec(align(16)) VUINT32 SgnMask[4][1];
        __declspec(align(16)) VUINT32 sTopMask12[4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 sPoly[8][4][1];
        __declspec(align(16)) VUINT32 sLn2[4][1];
        __declspec(align(16)) VUINT32 TinyRange[4][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
        /* sOne = SP 1.0 */
        .align   16
        .long    0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* SgnMask */
        .long    0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /* sTopMask12 */
        .align   16
        .long    0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        /* iBrkValue = SP 2/3 */
        .align   16
        .long    0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /* iOffExpoMask = SP significand mask */
        .align   16
        .long    0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff

        /* sPoly[] = SP polynomial */
        .align   16
        .long    0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
        .long    0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long    0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
        .long    0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long    0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
        .long    0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long    0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
        .long    0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */

        /* sLn2 = SP ln(2) */
        .align   16
        .long    0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        /* TinyRange */
        .align   16
        .long    0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        .align   16
        .type    __svml_satanh_data_internal, @object
        .size    __svml_satanh_data_internal, .-__svml_satanh_data_internal
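/* Illustrative only, not part of the build: a scalar C model (hypothetical
   names) of how the constants above are used on the fast path.  The kernel
   evaluates atanh(x) = 0.5 * log(m) with m = (1 + |x|) / (1 - |x|), carried
   as a compensated sum 1 + H + L; the sketch below shows only the log
   reduction and polynomial, applied to a plain float m.

       #include <stdint.h>
       #include <string.h>

       // Write m = 2^n * m' with m' in [2/3, 4/3), then
       // log(m) = n * ln2 + r + r^2 * P(r) with r = m' - 1 and
       // P given by the sPoly table (P7 first, P0 = -0.5 last).
       static float
       log_reduced (float m)
       {
         const float poly[8] = { // P7 ... P0, same values as sPoly above
           0x1.1b09dap-3f, -0x1.35b3c6p-3f, 0x1.1f9624p-3f, -0x1.515a6ep-3f,
           0x1.99c32p-3f, -0x1.000b1cp-2f, 0x1.555528p-2f, -0x1p-1f
         };
         uint32_t mi, off;
         memcpy (&mi, &m, sizeof (mi));
         off = mi - 0x3f2aaaab;                // subtract iBrkValue (2/3)
         int n = (int32_t) off >> 23;          // exponent, as psrad $23
         mi = (off & 0x007fffff) + 0x3f2aaaab; // iOffExpoMask, re-bias to [2/3, 4/3)
         float mp;
         memcpy (&mp, &mi, sizeof (mp));
         float r = mp - 1.0f;
         float p = poly[0];
         for (int i = 1; i < 8; i++)           // Horner, as the unrolled mulps/addps
           p = p * r + poly[i];
         return (float) n * 0x1.62e43p-1f      // + n * sLn2
                + (r + r * r * p);
       }

   The full result is then copysignf (0.5f, x) * log_reduced (m), except for
   |x| < TinyRange, where the kernel instead returns x + x * x with the sign
   of x OR'd back in so that atanh(-0) stays -0.  */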