1/* Function coshf vectorized with AVX-512. 2 Copyright (C) 2021-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 https://www.gnu.org/licenses/. */ 18 19/* 20 * ALGORITHM DESCRIPTION: 21 * 22 * Compute cosh(x) as (exp(x)+exp(-x))/2, 23 * where exp is calculated as 24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r) 25 * 26 * Special cases: 27 * 28 * cosh(NaN) = quiet NaN, and raise invalid exception 29 * cosh(INF) = that INF 30 * cosh(0) = 1 31 * cosh(x) overflows for big x and returns MAXLOG+log(2) 32 * 33 */ 34 35/* Offsets for data table __svml_scosh_data_internal 36 */ 37#define _sExp_tbl_PH 0 38#define _sExp_tbl_NH 128 39#define _sShifter_UISA 256 40#define _iDomainRange_UISA 320 41#define _sPC1_UISA 384 42#define _sPC2_UISA 448 43#define _sPC3_UISA 512 44#define _sInvLn2 576 45#define _sLn2hi 640 46#define _sLn2lo 704 47#define _sSign 768 48#define _iExpMask 832 49#define _sShifter 896 50#define _iDomainRange 960 51#define _sPC1 1024 52#define _sPC2 1088 53#define _sPC3 1152 54 55#include <sysdep.h> 56 57 .section .text.exex512, "ax", @progbits 58ENTRY(_ZGVeN16v_coshf_skx) 59 pushq %rbp 60 cfi_def_cfa_offset(16) 61 movq %rsp, %rbp 62 cfi_def_cfa(6, 16) 63 cfi_offset(6, -16) 64 andq $-64, %rsp 65 subq $192, %rsp 66 vmovups _sSign+__svml_scosh_data_internal(%rip), %zmm4 67 vmovups _sShifter_UISA+__svml_scosh_data_internal(%rip), %zmm6 68 69 /* 70 * Load argument 71 * dM = x/log(2) + RShifter 72 */ 73 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %zmm10 74 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %zmm7 75 vmovups _sLn2lo+__svml_scosh_data_internal(%rip), %zmm9 76 77 /* */ 78 vmovups _sPC3_UISA+__svml_scosh_data_internal(%rip), %zmm2 79 80 /* x^2 */ 81 vmovups _sPC2_UISA+__svml_scosh_data_internal(%rip), %zmm3 82 83 /* G1, G2 2^N, 2^(-N) */ 84 vmovups __svml_scosh_data_internal(%rip), %zmm12 85 vmovups _sExp_tbl_NH+__svml_scosh_data_internal(%rip), %zmm13 86 87 /* 88 * Implementation 89 * Abs argument 90 */ 91 vandnps %zmm0, %zmm4, %zmm1 92 93 /* Check for overflow\underflow */ 94 vpternlogd $255, %zmm5, %zmm5, %zmm5 95 vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm10 96 vpcmpd $1, _iDomainRange_UISA+__svml_scosh_data_internal(%rip), %zmm1, %k1 97 98 /* iM now is an EXP(2^N) */ 99 vpslld $18, %zmm10, %zmm11 100 101 /* 102 * R 103 * sN = sM - RShifter 104 */ 105 vsubps {rn-sae}, %zmm6, %zmm10, %zmm8 106 vpermt2ps _sExp_tbl_PH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm12 107 vpermt2ps _sExp_tbl_NH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm13 108 vpandnd %zmm1, %zmm1, %zmm5{%k1} 109 110 /* sR = sX - sN*Log2_hi */ 111 vfnmadd231ps {rn-sae}, %zmm7, %zmm8, %zmm1 112 vptestmd %zmm5, %zmm5, %k0 113 114 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */ 115 vfnmadd231ps {rn-sae}, %zmm9, %zmm8, %zmm1 116 kmovw %k0, %edx 117 vmulps {rn-sae}, %zmm1, %zmm1, %zmm4 118 vmulps {rn-sae}, %zmm4, %zmm2, %zmm2 119 120 /* sSinh_r = r + r*(r^2*(a3)) */ 121 vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm2 122 123 /* sOut = r^2*(a2) */ 124 vmulps {rn-sae}, %zmm4, %zmm3, %zmm1 125 vpandd _iExpMask+__svml_scosh_data_internal(%rip), %zmm11, %zmm14 126 vpaddd %zmm14, %zmm12, %zmm15 127 vpsubd %zmm14, %zmm13, %zmm10 128 129 /* sG2 = 2^N*Th + 2^(-N)*T_h */ 130 vaddps {rn-sae}, %zmm10, %zmm15, %zmm5 131 132 /* sG1 = 2^N*Th - 2^(-N)*T_h */ 133 vsubps {rn-sae}, %zmm10, %zmm15, %zmm6 134 135 /* res = sG1*(r + r*(r^2*(a3))) + sG2*(1+r^2*(a2)) */ 136 vfmadd213ps {rn-sae}, %zmm5, %zmm5, %zmm1 137 vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm6 138 testl %edx, %edx 139 140 /* Go to special inputs processing branch */ 141 jne L(SPECIAL_VALUES_BRANCH) 142 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6 143 144 /* Restore registers 145 * and exit the function 146 */ 147 148L(EXIT): 149 vmovaps %zmm6, %zmm0 150 movq %rbp, %rsp 151 popq %rbp 152 cfi_def_cfa(7, 8) 153 cfi_restore(6) 154 ret 155 cfi_def_cfa(6, 16) 156 cfi_offset(6, -16) 157 158 /* Branch to process 159 * special inputs 160 */ 161 162L(SPECIAL_VALUES_BRANCH): 163 vmovups %zmm0, 64(%rsp) 164 vmovups %zmm6, 128(%rsp) 165 # LOE rbx r12 r13 r14 r15 edx zmm6 166 167 xorl %eax, %eax 168 # LOE rbx r12 r13 r14 r15 eax edx 169 170 vzeroupper 171 movq %r12, 16(%rsp) 172 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ 173 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 174 movl %eax, %r12d 175 movq %r13, 8(%rsp) 176 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ 177 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 178 movl %edx, %r13d 179 movq %r14, (%rsp) 180 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ 181 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 182 # LOE rbx r15 r12d r13d 183 184 /* Range mask 185 * bits check 186 */ 187 188L(RANGEMASK_CHECK): 189 btl %r12d, %r13d 190 191 /* Call scalar math function */ 192 jc L(SCALAR_MATH_CALL) 193 # LOE rbx r15 r12d r13d 194 195 /* Special inputs 196 * processing loop 197 */ 198 199L(SPECIAL_VALUES_LOOP): 200 incl %r12d 201 cmpl $16, %r12d 202 203 /* Check bits in range mask */ 204 jl L(RANGEMASK_CHECK) 205 # LOE rbx r15 r12d r13d 206 207 movq 16(%rsp), %r12 208 cfi_restore(12) 209 movq 8(%rsp), %r13 210 cfi_restore(13) 211 movq (%rsp), %r14 212 cfi_restore(14) 213 vmovups 128(%rsp), %zmm6 214 215 /* Go to exit */ 216 jmp L(EXIT) 217 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ 218 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 219 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ 220 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 221 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ 222 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 223 # LOE rbx r12 r13 r14 r15 zmm6 224 225 /* Scalar math fucntion call 226 * to process special input 227 */ 228 229L(SCALAR_MATH_CALL): 230 movl %r12d, %r14d 231 vmovss 64(%rsp, %r14, 4), %xmm0 232 call coshf@PLT 233 # LOE rbx r14 r15 r12d r13d xmm0 234 235 vmovss %xmm0, 128(%rsp, %r14, 4) 236 237 /* Process special inputs in loop */ 238 jmp L(SPECIAL_VALUES_LOOP) 239 # LOE rbx r15 r12d r13d 240END(_ZGVeN16v_coshf_skx) 241 242 .section .rodata, "a" 243 .align 64 244 245#ifdef __svml_scosh_data_internal_typedef 246typedef unsigned int VUINT32; 247typedef struct { 248 __declspec(align(64)) VUINT32 _sExp_tbl_PH[32][1]; 249 __declspec(align(64)) VUINT32 _sExp_tbl_NH[32][1]; 250 __declspec(align(64)) VUINT32 _sShifter_UISA[16][1]; 251 __declspec(align(64)) VUINT32 _iDomainRange_UISA[16][1]; 252 __declspec(align(64)) VUINT32 _sPC1_UISA[16][1]; 253 __declspec(align(64)) VUINT32 _sPC2_UISA[16][1]; 254 __declspec(align(64)) VUINT32 _sPC3_UISA[16][1]; 255 __declspec(align(64)) VUINT32 _sInvLn2[16][1]; 256 __declspec(align(64)) VUINT32 _sLn2hi[16][1]; 257 __declspec(align(64)) VUINT32 _sLn2lo[16][1]; 258 __declspec(align(64)) VUINT32 _sSign[16][1]; 259 __declspec(align(64)) VUINT32 _iExpMask[16][1]; 260 __declspec(align(64)) VUINT32 _sShifter[16][1]; 261 __declspec(align(64)) VUINT32 _iDomainRange[16][1]; 262 __declspec(align(64)) VUINT32 _sPC1[16][1]; 263 __declspec(align(64)) VUINT32 _sPC2[16][1]; 264 __declspec(align(64)) VUINT32 _sPC3[16][1]; 265} __svml_scosh_data_internal; 266#endif 267__svml_scosh_data_internal: 268 /* _sExp_tbl_PH 2^(i/32-1), i=0..31 */ 269 .long 0x3f000000, 0x3f02cd87, 0x3f05aac3, 0x3f08980f 270 .long 0x3f0b95c2, 0x3f0ea43a, 0x3f11c3d3, 0x3f14f4f0 271 .long 0x3f1837f0, 0x3f1b8d3a, 0x3f1ef532, 0x3f227043 272 .long 0x3f25fed7, 0x3f29a15b, 0x3f2d583f, 0x3f3123f6 273 .long 0x3f3504f3, 0x3f38fbaf, 0x3f3d08a4, 0x3f412c4d 274 .long 0x3f45672a, 0x3f49b9be, 0x3f4e248c, 0x3f52a81e 275 .long 0x3f5744fd, 0x3f5bfbb8, 0x3f60ccdf, 0x3f65b907 276 .long 0x3f6ac0c7, 0x3f6fe4ba, 0x3f75257d, 0x3f7a83b3 277 /* _sExp_tbl_NH 2^(-i/32-1), i=0..31 */ 278 .align 64 279 .long 0x3f000000, 0x3efa83b3, 0x3ef5257d, 0x3eefe4ba 280 .long 0x3eeac0c7, 0x3ee5b907, 0x3ee0ccdf, 0x3edbfbb8 281 .long 0x3ed744fd, 0x3ed2a81e, 0x3ece248c, 0x3ec9b9be 282 .long 0x3ec5672a, 0x3ec12c4d, 0x3ebd08a4, 0x3eb8fbaf 283 .long 0x3eb504f3, 0x3eb123f6, 0x3ead583f, 0x3ea9a15b 284 .long 0x3ea5fed7, 0x3ea27043, 0x3e9ef532, 0x3e9b8d3a 285 .long 0x3e9837f0, 0x3e94f4f0, 0x3e91c3d3, 0x3e8ea43a 286 .long 0x3e8b95c2, 0x3e88980f, 0x3e85aac3, 0x3e82cd87 287 .align 64 288 .long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000 /* 1.5*2^18 _sShifter_UISA */ 289 .align 64 290 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange_UISA */ 291 .align 64 292 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1_UISA=1 */ 293 .align 64 294 .long 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f /* _sPC2_UISA */ 295 .align 64 296 .long 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd /* _sPC3_UISA */ 297 .align 64 298 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0 299 .align 64 300 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */ 301 .align 64 302 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */ 303 .align 64 304 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */ 305 .align 64 306 .long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _iExpMask */ 307 .align 64 308 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */ 309 .align 64 310 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */ 311 .align 64 312 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */ 313 .align 64 314 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */ 315 .align 64 316 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */ 317 .align 64 318 .type __svml_scosh_data_internal, @object 319 .size __svml_scosh_data_internal, .-__svml_scosh_data_internal 320