/* Function log1pf vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
 *    Get short reciprocal approximation Rcp ~ 1/xh
 *    R = (Rcp*xh - 1.0) + Rcp*xl
 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *       log(Rcp) is tabulated
 *
 *
 */

/* Offsets for data table __svml_slog1p_data_internal.
   Each field is a 64-byte (one ZMM register) wide row.  */
#define SgnMask				0
#define sOne				64
#define sPoly_1				128
#define sPoly_2				192
#define sPoly_3				256
#define sPoly_4				320
#define sPoly_5				384
#define sPoly_6				448
#define sPoly_7				512
#define sPoly_8				576
#define iHiDelta			640
#define iLoRange			704
#define iBrkValue			768
#define iOffExpoMask			832
#define sLn2				896

#include <sysdep.h>

/* Was ".text.exex512" — typo for the EVEX512 ISA-level section name.  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_log1pf_skx)
	/* In: zmm0 = 16 packed floats x.  Out: zmm0 = log1p(x).
	   edx returns the lane mask of special inputs needing the
	   scalar fallback path.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* 64-byte-align the stack; 192 bytes of spill area for the
	   special-value path (input at 64(%rsp), result at 128(%rsp)).  */
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	sOne+__svml_slog1p_data_internal(%rip), %zmm2

	/* reduction: compute r, n */
	vmovups	iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
	vmovups	SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
	vmovaps	%zmm0, %zmm3

	/* compute 1+x as high, low parts:
	   zmm9 = hi = max(1,x)+min(1,x), zmm15 = lo (rounding error).  */
	vmaxps	{sae}, %zmm3, %zmm2, %zmm5
	vminps	{sae}, %zmm3, %zmm2, %zmm7
	vandnps	%zmm3, %zmm4, %zmm1	/* zmm1 = sign bit of x (for -0).  */
	vpternlogd $255, %zmm4, %zmm4, %zmm4	/* zmm4 = all-ones.  */
	vaddps	{rn-sae}, %zmm7, %zmm5, %zmm9
	vpsubd	%zmm12, %zmm9, %zmm10
	vsubps	{rn-sae}, %zmm9, %zmm5, %zmm6

	/* check argument value ranges */
	vpaddd	iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
	vpsrad	$23, %zmm10, %zmm13	/* zmm13 = exponent k.  */
	vmovups	sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
	vpcmpd	$5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
	vpslld	$23, %zmm13, %zmm14
	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm15
	vcvtdq2ps {rn-sae}, %zmm13, %zmm0	/* zmm0 = (float) k.  */
	vpsubd	%zmm14, %zmm2, %zmm13	/* zmm13 = 2^(-k).  */
	vmovups	sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
	vmovups	sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
	vmulps	{rn-sae}, %zmm13, %zmm15, %zmm6	/* scale low part.  */
	vpandd	iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
	vpaddd	%zmm12, %zmm11, %zmm5	/* mantissa re-biased near 1.  */
	vmovups	sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
	vmovups	sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
	vmovups	sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12

	/* polynomial evaluation (Horner, degree 8 in R = zmm15) */
	vsubps	{rn-sae}, %zmm2, %zmm5, %zmm2
	vaddps	{rn-sae}, %zmm6, %zmm2, %zmm15
	vmovups	sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
	vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
	vpandnd	%zmm8, %zmm8, %zmm4{%k1}	/* clear lanes in range.  */
	vmovups	sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8

	/* combine and get argument value range mask */
	vptestmd %zmm4, %zmm4, %k0
	vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
	kmovw	%k0, %edx
	vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
	vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
	vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
	vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
	vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
	vmulps	{rn-sae}, %zmm15, %zmm2, %zmm4
	vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4	/* R + R*poly.  */

	/* final reconstruction: k*ln2 + poly(R), with sign of -0 ORed
	   back in so log1p(-0) == -0.  */
	vmovups	sLn2+__svml_slog1p_data_internal(%rip), %zmm15
	vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
	vorps	%zmm1, %zmm0, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm3, 64(%rsp)		/* spill original input.  */
	vmovups	%zmm0, 128(%rsp)	/* spill fast-path result.  */
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax		/* eax = lane index 0.  */
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d		/* r12d = current lane.  */
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d		/* r13d = special-lane bitmask.  */
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d		/* 16 lanes in a ZMM of floats.  */

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0	/* reload patched result.  */

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0	/* load lane's input.  */
	call	log1pf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)	/* patch lane's result.  */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_log1pf_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_slog1p_data_internal_typedef
/* Documentation-only C view of the table layout below (never compiled;
   the guard macro is not defined anywhere).  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 SgnMask[16][1];
	__declspec(align(64)) VUINT32 sOne[16][1];
	__declspec(align(64)) VUINT32 sPoly[8][16][1];
	__declspec(align(64)) VUINT32 iHiDelta[16][1];
	__declspec(align(64)) VUINT32 iLoRange[16][1];
	__declspec(align(64)) VUINT32 iBrkValue[16][1];
	__declspec(align(64)) VUINT32 iOffExpoMask[16][1];
	__declspec(align(64)) VUINT32 sLn2[16][1];
} __svml_slog1p_data_internal;
#endif
__svml_slog1p_data_internal:
	/* SgnMask: 0x7fffffff clears the sign bit of a float.  */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sOne = SP 1.0 */
	.align	64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sPoly[] = SP polynomial coefficients, Horner order P0..P7
	   (accessed via the sPoly_1..sPoly_8 offsets in the code).  */
	.align	64
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	/* iHiDelta = SP 80000000-7f000000 (integer offset used in the
	   biased-exponent range check).  */
	.align	64
	.long	0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
	/* iLoRange = SP 00800000+iHiDelta */
	.align	64
	.long	0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
	/* iBrkValue = SP 2/3 (break point of the argument reduction).  */
	.align	64
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	64
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sLn2 = SP ln(2) */
	.align	64
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.align	64
	.type	__svml_slog1p_data_internal, @object
	.size	__svml_slog1p_data_internal, .-__svml_slog1p_data_internal