/* Function tanhf vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *  NOTE: Since the hyperbolic tangent function is odd
 *        (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 *        value of the argument |x|: tanh(x) = sign(x) * tanh(|x|).
 *
 *  We use a table lookup method to compute tanh(|x|).
 *  The basic idea is to split the input range into a number of subintervals
 *  and to approximate tanh(.) with a polynomial on each of them.
 *
 *  IEEE SPECIAL CONDITIONS:
 *  x = [+, -]0, r = [+, -]0
 *  x = +Inf,    r = +1
 *  x = -Inf,    r = -1
 *  x = QNaN,    r = QNaN
 *  x = SNaN,    r = QNaN
 *
 *
 *  ALGORITHM DETAILS
 *  We handle special values in a callout function, aside from the main
 *  path computations.  "Special" for this algorithm are:
 *  INF, NAN, |x| > HUGE_THRESHOLD
 *
 *
 *  Main path computations are organized as follows:
 *  we split the interval [0, SATURATION_THRESHOLD)
 *  into a number of subintervals.  On each subinterval we approximate tanh(.)
 *  with a minimax polynomial of pre-defined degree.  Polynomial coefficients
 *  are computed beforehand and stored in a table.  We also use
 *
 *       y := |x| + B,
 *
 *  where B depends on the subinterval and is used to bring the argument
 *  closer to zero.
 *  We also add a large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
 *  where the coefficients 1.0 + 0.0*y + 0.0*y^2 ... are stored - just to
 *  preserve the main path computation logic but return 1.0 for all arguments.
 *
 *  Hence the reconstruction looks as follows:
 *  we extract the proper polynomial and range reduction coefficients
 *  (Pj and B) corresponding to the subinterval to which |x| belongs,
 *  and return
 *
 *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 *  NOTE: we use a multiprecision technique to multiply and sum the first
 *        K terms of the polynomial.  So Pj, j = 0..K, are each stored in
 *        the table as a pair of target precision numbers (Pj and PLj) to
 *        achieve wider than target precision.
 *
 */
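
/* An illustrative scalar C sketch of the scheme described above (a reading
   aid only; the names lookup_table, entry, B, P, NUM_COEFFS, interval_index
   and tanhf_special are hypothetical, and the real code below evaluates the
   split-precision coefficient pairs (Pj, PLj) with double precision FMAs
   rather than this simplified Horner loop):

       float tanhf_sketch (float x)
       {
         float ax = fabsf (x);
         if (!(ax <= HUGE_THRESHOLD))            // Inf, NaN, huge |x|.
           return tanhf_special (x);
         const struct entry *e = &lookup_table[interval_index (ax)];
         double y = (double) ax + e->B;          // Shift |x| toward zero.
         double r = e->P[NUM_COEFFS - 1];
         for (int j = NUM_COEFFS - 2; j >= 0; j--)
           r = r * y + e->P[j];                  // Horner evaluation.
         return copysignf ((float) r, x);        // tanh is odd.
       }  */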

#include <sysdep.h>

/* tanhf data tables for the avx2 and sse4 implementations are defined
   here.  */
#include "svml_s_tanhf_rodata.S"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanhf_avx2)
	/* Here huge arguments, INF and NaNs are filtered out to the callout.  */
	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2

	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
	vpxor	%ymm3, %ymm3, %ymm3
	vpmaxsd	%ymm3, %ymm2, %ymm2
	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2

	/* Small table specific variables: constant loading.  */
	vpsrld	$14, %ymm2, %ymm1

	/* We are splitting xmm1 into 8 GPRs.  This may be faster to do with
	   store/load as we can take advantage of store-forwarding.  */
	vmovq	%xmm1, %r8
	/* We have eliminated all negative values in ymm1 so there is no need
	   to sign extend.  */
	movl	%r8d, %r9d
	shrq	$32, %r8

	/* Store base of lookup table in rax.  */
	leaq	TANHF_DATA(_lookupTable)(%rip), %rax

	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
	   with a memory operand.  This helps alleviate the bottleneck on
	   port 5.  */
	vmovupd	16(%r9, %rax), %xmm5

	vpextrq	$1, %xmm1, %rsi
	movl	%esi, %edi
	shrq	$32, %rsi

	vinsertf128	$1, 16(%rdi, %rax), %ymm5, %ymm5

	vextracti128	$1, %ymm1, %xmm2
	vmovq	%xmm2, %rdx
	movl	%edx, %ecx
	shrq	$32, %rdx

	vmovupd	(%rcx, %rax), %xmm6

	vpextrq	$1, %xmm2, %r10
	movl	%r10d, %r11d
	shrq	$32, %r10

	vinsertf128	$1, (%r11, %rax), %ymm6, %ymm6

	vmovupd	16(%r8, %rax), %xmm1
	vinsertf128	$1, 16(%rsi, %rax), %ymm1, %ymm1
	vmovupd	(%rdx, %rax), %xmm3
	vinsertf128	$1, (%r10, %rax), %ymm3, %ymm3

	vunpcklpd	%ymm3, %ymm6, %ymm7
	vunpckhpd	%ymm3, %ymm6, %ymm6

	vunpcklpd	%ymm1, %ymm5, %ymm3
	vunpckhpd	%ymm1, %ymm5, %ymm1

	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
	/* Store special cases in ymm15.  */
	vpcmpgtd	TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15

	vandps	%ymm11, %ymm0, %ymm4

	vcvtps2pd	%xmm4, %ymm5

	vextractf128	$1, %ymm4, %xmm4
	vcvtps2pd	%xmm4, %ymm4

	vmovupd	16(%rcx, %rax), %xmm2
	vinsertf128	$1, 16(%r11, %rax), %ymm2, %ymm2

	vfmadd213pd	%ymm3, %ymm5, %ymm1

	vmovupd	16(%rdx, %rax), %xmm3
	vinsertf128	$1, 16(%r10, %rax), %ymm3, %ymm3

	vunpcklpd	%ymm3, %ymm2, %ymm10
	vunpckhpd	%ymm3, %ymm2, %ymm2

	vfmadd213pd	%ymm10, %ymm4, %ymm2
	vfmadd213pd	%ymm6, %ymm4, %ymm2
	vfmadd213pd	%ymm7, %ymm4, %ymm2
	vcvtpd2ps	%ymm2, %xmm2

	vmovupd	(%r9, %rax), %xmm7
	vinsertf128	$1, (%rdi, %rax), %ymm7, %ymm7

	vmovupd	(%r8, %rax), %xmm3
	vinsertf128	$1, (%rsi, %rax), %ymm3, %ymm3

	vunpckhpd	%ymm3, %ymm7, %ymm4
	vunpcklpd	%ymm3, %ymm7, %ymm7

	vfmadd213pd	%ymm4, %ymm5, %ymm1
	vfmadd213pd	%ymm7, %ymm5, %ymm1

	vcvtpd2ps	%ymm1, %xmm1
	vinsertf128	$1, %xmm2, %ymm1, %ymm1

	vmovmskps	%ymm15, %edx
	vandnps	%ymm0, %ymm11, %ymm2
	testl	%edx, %edx
	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Wait until after the branch to write over ymm0.  */
	vorps	%ymm2, %ymm1, %ymm0
	/* No stack restoration on the fastpath.  */
	ret


	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by a tanhf call.  Optimize for code size
	   more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Use r13 to save/restore the stack.  This allows us to use rbp as
	   a callee save register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* Need callee save registers to preserve state across tanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	addq	$-64, %rsp
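
	/* Frame layout note (a reading aid derived from the code below): the
	   64 bytes just reserved hold two ymm vectors, the merged fast-path
	   results at (%rsp) and the original inputs at 32(%rsp), so special
	   lane i is reloaded from 32(%rsp, i, 4) and its result patched into
	   (%rsp, i, 4).  The special-values loop below walks the set bits of
	   the mask; an illustrative C sketch of that control flow (mask, src
	   and res are hypothetical names):

	       for (unsigned m = mask; m != 0; m &= m - 1)  // blsrl
	         {
	           int i = __builtin_ctz (m);               // tzcntl
	           res[i] = tanhf (src[i]);                 // scalar callout
	         }
	   */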
	/* Save all already computed inputs.  */
	vorps	%ymm2, %ymm1, %ymm1
	vmovaps	%ymm1, (%rsp)
	/* Save original input (ymm0 unchanged up to this point).  */
	vmovaps	%ymm0, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be handled
	   by a tanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as index for the special value that is saved across calls
	   to tanhf.  We technically don't need a callee save register here as
	   the offset to rsp is always [0, 28] so we could restore rsp by
	   realigning to 64.  Essentially the tradeoff is 1 extra save/restore
	   vs 2 extra instructions in the loop.  Realigning also costs more
	   code size.  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause on
	   return.  `lfence` avoids the SF fault but at greater cost as it
	   serializes stack/callee save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15


	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_tanhf_avx2)
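
/* The exported name follows the x86-64 vector function ABI mangling:
   _ZGV + 'd' (AVX2 ISA class) + 'N' (unmasked) + '8' (eight lanes) +
   'v' (one vector argument) + the scalar name "tanhf".  An illustrative
   usage sketch (not part of this file; assumes GCC with libmvec, e.g.
   -O2 -ffast-math -march=haswell, under which the compiler may vectorize
   the scalar loop into calls to this entry point):

       #include <math.h>

       void
       tanhf_array (float *dst, const float *src, int n)
       {
         for (int i = 0; i < n; i++)
           dst[i] = tanhf (src[i]);  // may become _ZGVdN8v_tanhf_avx2
       }
 */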