/* Function log vectorized with AVX2.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_log_data.h"

        .text
ENTRY (_ZGVdN4v_log_avx2)
/* ALGORITHM DESCRIPTION:

    log(x) = -log(Rcp) + log(Rcp*x),
    where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
    the HW approximation to 1+9 mantissa bits)

    Reduced argument R = Rcp*x - 1 is used to approximate log(1+R)
    as a polynomial.

    log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp);
    -log(mantissa_Rcp) is obtained from a lookup table,
    accessed by a 9-bit index.

    A scalar C model of this scheme is sketched in a comment after
    END below.  */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_dlog_data@GOTPCREL(%rip), %rax
        vmovdqa   %ymm0, %ymm5

/* isolate exponent bits */
        vpsrlq    $20, %ymm5, %ymm0

/* preserve mantissa, set input exponent to 2^(-10) */
        vandpd    _ExpMask(%rax), %ymm5, %ymm6
        vorpd     _Two10(%rax), %ymm6, %ymm4

/* reciprocal approximation good to at least 11 bits */
        vcvtpd2ps %ymm4, %xmm7
        vrcpps    %xmm7, %xmm1
        vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7
        vcvtps2pd %xmm1, %ymm3
        vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1
        vextracti128 $1, %ymm0, %xmm2
        vshufps   $221, %xmm2, %xmm0, %xmm6

/* round reciprocal to nearest integer; result has 1+9 mantissa bits */
        vroundpd  $0, %ymm3, %ymm2

/* convert biased exponent to DP format */
        vcvtdq2pd %xmm6, %ymm0

/* combine and get argument value range mask */
        vorpd     %ymm1, %ymm7, %ymm3
        vmovupd   _One(%rax), %ymm1
        vmovmskpd %ymm3, %ecx

/* calculate index for table lookup */
        vpsrlq    $40, %ymm2, %ymm3

/* argument reduction: R = Mantissa*Rcp - 1 */
        vfmsub213pd %ymm1, %ymm2, %ymm4
        vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2
        vpcmpeqd  %ymm6, %ymm6, %ymm6
        vxorpd    %ymm1, %ymm1, %ymm1
        vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1

/* start polynomial evaluation: load coeff_1, compute R^2 */
        vmovupd   _poly_coeff_1(%rax), %ymm6
        vmulpd    %ymm4, %ymm4, %ymm3

/* polynomial computation */
        vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6
        vandpd    _Bias(%rax), %ymm2, %ymm7
        vorpd     _Bias1(%rax), %ymm7, %ymm2

/*
   Table stores -log(0.5*mantissa) for larger mantissas;
   adjust exponent accordingly
 */
        vsubpd    %ymm2, %ymm0, %ymm0
        vmovupd   _poly_coeff_3(%rax), %ymm2
        vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2
        vfmadd213pd %ymm2, %ymm3, %ymm6

/*
   reconstruction:
   (exponent*log(2)) + (LogRcp + (R+poly))
 */
        vfmadd213pd %ymm4, %ymm3, %ymm6
        vaddpd    %ymm1, %ymm6, %ymm4
        vfmadd132pd _L2(%rax), %ymm4, %ymm0
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovupd   %ymm5, 320(%rsp)
        vmovupd   %ymm0, 384(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        xorl      %eax, %eax
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        movb      %dl, %r12b
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        movl      %eax, %r14d
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovupd   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    328(%rsp,%r15), %xmm0
        vzeroupper

        call      JUMPTARGET(log)

        vmovsd    %xmm0, 392(%rsp,%r15)
        jmp       .LBL_1_8

.LBL_1_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    320(%rsp,%r15), %xmm0
        vzeroupper

        call      JUMPTARGET(log)

        vmovsd    %xmm0, 384(%rsp,%r15)
        jmp       .LBL_1_7

END (_ZGVdN4v_log_avx2)
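
/* For reference: a minimal scalar C sketch of the algorithm described
   above.  It is illustrative only and not part of the build.  The
   function name log_model, the Taylor coefficients C1..C4, and the use
   of -log(rcp) are all stand-ins: the real code uses tuned minimax
   coefficients (_poly_coeff_1.._poly_coeff_4) and the 9-bit-indexed
   _LogRcp_lookup table from __svml_dlog_data, with the _Threshold/_Bias
   exponent adjustment folded into the table scheme.

   #include <float.h>
   #include <math.h>

   // Taylor stand-ins for the minimax _poly_coeff_1 .. _poly_coeff_4.
   #define C1 (1.0 / 5.0)
   #define C2 (-1.0 / 4.0)
   #define C3 (1.0 / 3.0)
   #define C4 (-1.0 / 2.0)

   static double log_model (double x)
   {
     // Special cases (<= 0, denormal, inf, nan) take the scalar
     // fallback, as the vector code does via its _MinNorm/_MaxNorm mask.
     if (!(x >= DBL_MIN && x <= DBL_MAX))
       return log (x);

     // Split x = 2^k * m with m in [1, 2).
     int k;
     double m = 2.0 * frexp (x, &k);
     k -= 1;

     // Reciprocal good to ~9 bits with at most 1+9 mantissa bits: the
     // vector code forces the exponent to 2^(-10) so vrcpps+vroundpd
     // produce an integer; here the 2^10 scaling is explicit.
     double rcp = nearbyint (1024.0 / m) / 1024.0;

     // Reduced argument R = m*rcp - 1, |R| roughly <= 2^-9.
     double r = fma (m, rcp, -1.0);

     // log(1+R) with the same evaluation shape as the vector code:
     // ((C1*R + C2)*R^2 + (C3*R + C4))*R^2 + R.
     double r2 = r * r;
     double poly = ((C1 * r + C2) * r2 + (C3 * r + C4)) * r2 + r;

     // Reconstruction: log(x) = k*log(2) + (-log(rcp) + log(1+R)),
     // where -log(rcp) models the _LogRcp_lookup table entry.
     return k * M_LN2 + (-log (rcp) + poly);
   }
 */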