1/* Function sinhf vectorized with SSE4. 2 Copyright (C) 2021-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 https://www.gnu.org/licenses/. */ 18 19/* 20 * ALGORITHM DESCRIPTION: 21 * 22 * Compute sinh(x) as (exp(x)-exp(-x))/2, 23 * where exp is calculated as 24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r) 25 * 26 * Special cases: 27 * 28 * sinh(NaN) = quiet NaN, and raise invalid exception 29 * sinh(INF) = that INF 30 * sinh(x) = x for subnormals 31 * sinh(x) overflows for big x and returns MAXLOG+log(2) 32 * 33 */ 34 35/* Offsets for data table __svml_ssinh_data_internal 36 */ 37#define _sInvLn2 0 38#define _sLn2hi 16 39#define _sLn2lo 32 40#define _sSign 48 41#define _sShifter 64 42#define _iDomainRange 80 43#define _sPC1 96 44#define _sPC2 112 45#define _sPC3 128 46#define _sPC4 144 47#define _sPC5 160 48#define _sPC6 176 49#define _iHalf 192 50 51#include <sysdep.h> 52 53 .section .text.sse4, "ax", @progbits 54ENTRY(_ZGVbN4v_sinhf_sse4) 55 subq $72, %rsp 56 cfi_def_cfa_offset(80) 57 58 /* 59 * Implementation 60 * Abs argument 61 */ 62 movups _sSign+__svml_ssinh_data_internal(%rip), %xmm14 63 andps %xmm0, %xmm14 64 movaps %xmm14, %xmm10 65 66 /* 67 * Load argument 68 * dM = x/log(2) + RShifter 69 */ 70 movups _sInvLn2+__svml_ssinh_data_internal(%rip), %xmm7 71 pxor %xmm0, %xmm10 72 mulps %xmm10, %xmm7 73 74 /* 75 * Check for overflow\underflow 76 * MORE faster than GE? 77 */ 78 movaps %xmm10, %xmm1 79 movups _sShifter+__svml_ssinh_data_internal(%rip), %xmm2 80 81 /* sR = sX - sN*Log2_hi */ 82 movups _sLn2hi+__svml_ssinh_data_internal(%rip), %xmm3 83 addps %xmm2, %xmm7 84 85 /* 86 * R 87 * sN = sM - RShifter 88 */ 89 movaps %xmm7, %xmm4 90 91 /* 92 * G1, G2 2^N, 2^(-N) 93 * iM now is an EXP(2^N) 94 */ 95 pslld $23, %xmm7 96 97 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */ 98 movups _sLn2lo+__svml_ssinh_data_internal(%rip), %xmm5 99 subps %xmm2, %xmm4 100 mulps %xmm4, %xmm3 101 mulps %xmm4, %xmm5 102 subps %xmm3, %xmm10 103 104 /* 105 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) .... 106 * sSinh_r = (a3+r^2*a5) 107 */ 108 movups _sPC5+__svml_ssinh_data_internal(%rip), %xmm8 109 subps %xmm5, %xmm10 110 111 /* sR2 = sR^2 */ 112 movaps %xmm10, %xmm12 113 mulps %xmm10, %xmm12 114 115 /* 116 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) 117 * sOut = (a4 +a6*sR2) 118 */ 119 movups _sPC6+__svml_ssinh_data_internal(%rip), %xmm9 120 mulps %xmm12, %xmm8 121 mulps %xmm12, %xmm9 122 addps _sPC3+__svml_ssinh_data_internal(%rip), %xmm8 123 addps _sPC4+__svml_ssinh_data_internal(%rip), %xmm9 124 125 /* sSinh_r = r^2*(a3+r^2*a5) */ 126 mulps %xmm12, %xmm8 127 128 /* sOut = a2+sR2*(a4+a6*sR2) */ 129 mulps %xmm12, %xmm9 130 131 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */ 132 mulps %xmm10, %xmm8 133 addps _sPC2+__svml_ssinh_data_internal(%rip), %xmm9 134 addps %xmm8, %xmm10 135 136 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */ 137 mulps %xmm9, %xmm12 138 movdqu _iHalf+__svml_ssinh_data_internal(%rip), %xmm6 139 movdqa %xmm6, %xmm13 140 psubd %xmm7, %xmm6 141 paddd %xmm7, %xmm13 142 143 /* sG1 = 2^(N-1)+2^(-N-1) */ 144 movdqa %xmm13, %xmm11 145 146 /* sG2 = 2^(N-1)-2^(-N-1) */ 147 subps %xmm6, %xmm13 148 addps %xmm6, %xmm11 149 150 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */ 151 mulps %xmm13, %xmm12 152 153 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */ 154 mulps %xmm10, %xmm11 155 pcmpgtd _iDomainRange+__svml_ssinh_data_internal(%rip), %xmm1 156 addps %xmm11, %xmm12 157 movmskps %xmm1, %edx 158 159 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */ 160 addps %xmm12, %xmm13 161 162 /* Ret H */ 163 orps %xmm13, %xmm14 164 testl %edx, %edx 165 166 /* Go to special inputs processing branch */ 167 jne L(SPECIAL_VALUES_BRANCH) 168 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm14 169 170 /* Restore registers 171 * and exit the function 172 */ 173 174L(EXIT): 175 movaps %xmm14, %xmm0 176 addq $72, %rsp 177 cfi_def_cfa_offset(8) 178 ret 179 cfi_def_cfa_offset(80) 180 181 /* Branch to process 182 * special inputs 183 */ 184 185L(SPECIAL_VALUES_BRANCH): 186 movups %xmm0, 32(%rsp) 187 movups %xmm14, 48(%rsp) 188 # LOE rbx rbp r12 r13 r14 r15 edx 189 190 xorl %eax, %eax 191 movq %r12, 16(%rsp) 192 cfi_offset(12, -64) 193 movl %eax, %r12d 194 movq %r13, 8(%rsp) 195 cfi_offset(13, -72) 196 movl %edx, %r13d 197 movq %r14, (%rsp) 198 cfi_offset(14, -80) 199 # LOE rbx rbp r15 r12d r13d 200 201 /* Range mask 202 * bits check 203 */ 204 205L(RANGEMASK_CHECK): 206 btl %r12d, %r13d 207 208 /* Call scalar math function */ 209 jc L(SCALAR_MATH_CALL) 210 # LOE rbx rbp r15 r12d r13d 211 212 /* Special inputs 213 * processing loop 214 */ 215 216L(SPECIAL_VALUES_LOOP): 217 incl %r12d 218 cmpl $4, %r12d 219 220 /* Check bits in range mask */ 221 jl L(RANGEMASK_CHECK) 222 # LOE rbx rbp r15 r12d r13d 223 224 movq 16(%rsp), %r12 225 cfi_restore(12) 226 movq 8(%rsp), %r13 227 cfi_restore(13) 228 movq (%rsp), %r14 229 cfi_restore(14) 230 movups 48(%rsp), %xmm14 231 232 /* Go to exit */ 233 jmp L(EXIT) 234 cfi_offset(12, -64) 235 cfi_offset(13, -72) 236 cfi_offset(14, -80) 237 # LOE rbx rbp r12 r13 r14 r15 xmm14 238 239 /* Scalar math fucntion call 240 * to process special input 241 */ 242 243L(SCALAR_MATH_CALL): 244 movl %r12d, %r14d 245 movss 32(%rsp, %r14, 4), %xmm0 246 call sinhf@PLT 247 # LOE rbx rbp r14 r15 r12d r13d xmm0 248 249 movss %xmm0, 48(%rsp, %r14, 4) 250 251 /* Process special inputs in loop */ 252 jmp L(SPECIAL_VALUES_LOOP) 253 # LOE rbx rbp r15 r12d r13d 254END(_ZGVbN4v_sinhf_sse4) 255 256 .section .rodata, "a" 257 .align 16 258 259#ifdef __svml_ssinh_data_internal_typedef 260typedef unsigned int VUINT32; 261typedef struct { 262 __declspec(align(16)) VUINT32 _sInvLn2[4][1]; 263 __declspec(align(16)) VUINT32 _sLn2hi[4][1]; 264 __declspec(align(16)) VUINT32 _sLn2lo[4][1]; 265 __declspec(align(16)) VUINT32 _sSign[4][1]; 266 __declspec(align(16)) VUINT32 _sShifter[4][1]; 267 __declspec(align(16)) VUINT32 _iDomainRange[4][1]; 268 __declspec(align(16)) VUINT32 _sPC1[4][1]; 269 __declspec(align(16)) VUINT32 _sPC2[4][1]; 270 __declspec(align(16)) VUINT32 _sPC3[4][1]; 271 __declspec(align(16)) VUINT32 _sPC4[4][1]; 272 __declspec(align(16)) VUINT32 _sPC5[4][1]; 273 __declspec(align(16)) VUINT32 _sPC6[4][1]; 274 __declspec(align(16)) VUINT32 _iHalf[4][1]; 275} __svml_ssinh_data_internal; 276#endif 277__svml_ssinh_data_internal: 278 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0 279 .align 16 280 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */ 281 .align 16 282 .long 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */ 283 .align 16 284 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */ 285 .align 16 286 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */ 287 .align 16 288 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */ 289 .align 16 290 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */ 291 .align 16 292 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */ 293 .align 16 294 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */ 295 .align 16 296 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */ 297 .align 16 298 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */ 299 .align 16 300 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */ 301 // Integer constants 302 .align 16 303 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */ 304 .align 16 305 .type __svml_ssinh_data_internal, @object 306 .size __svml_ssinh_data_internal, .-__svml_ssinh_data_internal 307