/* Function logf vectorized with AVX2.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_logf_data.h"

        .text
ENTRY(_ZGVdN8v_logf_avx2)
/*
   ALGORITHM DESCRIPTION:

     log(x) = exponent_x*log(2) + log(mantissa_x),         if mantissa_x<4/3
     log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3

     R = mantissa_x - 1,     if mantissa_x<4/3
     R = 0.5*mantissa_x - 1, if mantissa_x>4/3
     |R|< 1/3

     log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
     degree 7 for 4-ulp, degree 3 for half-precision.

   INTERFACE (AT&T syntax, 8 packed single-precision lanes):
     In:   %ymm0 = x[0..7]
     Out:  %ymm0 = logf (x[i]) for each lane

   REGISTER ROLES ON THE FAST PATH:
     %rax  = address of __svml_slog_data (constant table)
     %ymm2 = copy of the input vector (kept for the scalar fallback)
     %ymm6 = _iBrkValue, later R + R^2*P(R)
     %ymm7 = x + _iHiDelta (integer add), used for the range check
     %ymm1 = _iLoRange, later the per-lane special-argument mask
     %ymm5 = reduced argument R
     %ymm4 = Horner accumulator for the polynomial
     %ymm0 = (float) exponent, later the final result
     %ecx  = 8-bit lane mask: bit i set => lane i needs scalar logf

   STACK FRAME (relative to the 64-byte aligned %rsp), only used on the
   fallback path:
       0..255  spilled %ymm15 (offset 0) .. %ymm8 (offset 224)
     256, 264  spilled %rdi, %rsi
     272..296  spilled %r15, %r14, %r13, %r12
     320..351  input vector, addressed as 8 floats
     384..415  result vector, addressed as 8 floats
 */

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_slog_data@GOTPCREL(%rip), %rax
        vmovaps   %ymm0, %ymm2
        vmovups   _iBrkValue(%rax), %ymm6
        vmovups   _iLoRange(%rax), %ymm1

/* check for working range,
   set special argument mask (denormals/zero/Inf/NaN) */
        vpaddd    _iHiDelta(%rax), %ymm2, %ymm7

/* reduction: compute r,n */
        vpsubd    %ymm6, %ymm2, %ymm4

/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
        vpsrad    $23, %ymm4, %ymm3
        vpand     _iOffExpoMask(%rax), %ymm4, %ymm5
        vmovups   _sPoly_7(%rax), %ymm4
        vcvtdq2ps %ymm3, %ymm0

/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
        vpaddd    %ymm6, %ymm5, %ymm3

/* reduced argument R */
        vsubps    _sOne(%rax), %ymm3, %ymm5

/* polynomial evaluation starts here
   (Horner scheme: each step is ymm4 = ymm4 * R + next coefficient) */
        vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4
        vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4
        vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4
        vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4
        vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4
        vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4
        vmulps    %ymm5, %ymm4, %ymm6

/* polynomial evaluation end: ymm6 = R + R * (R * P(R)) */
        vfmadd213ps %ymm5, %ymm5, %ymm6

/* lane is special when _iLoRange > x + _iHiDelta (signed dword compare) */
        vpcmpgtd  %ymm7, %ymm1, %ymm1
        vmovmskps %ymm1, %ecx

/* final reconstruction:
   add exponent_value*log2 to polynomial result */
        vfmadd132ps _sLn2(%rax), %ymm6, %ymm0
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: result is in %ymm0.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path: at least one lane is flagged in %ecx.  Spill the input
   and the vector result so that individual lanes can be recomputed
   with the scalar logf and patched in place.  */
.LBL_1_3:
        cfi_restore_state
        vmovups   %ymm2, 320(%rsp)
        vmovups   %ymm0, 384(%rsp)

/* Preserve registers around the scalar calls.  */
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)

/* Loop state lives in callee-saved registers so it survives the calls:
     %r12b = index of the current lane pair
     %r13d = special-lane mask
     %r14d = bit index of the even lane of the pair (2 * %r12b)
     %r15  = zero-extended pair index used for addressing  */
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        xorl      %r12d, %r12d
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        xorl      %r14d, %r14d
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

/* Even lane of the pair.  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

/* Odd lane of the pair.  */
.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

/* Advance to the next pair.  vmovmskps on a ymm source produces only
   8 mask bits, so 4 pairs cover every lane.  */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $4, %r12b
        jb        .LBL_1_6

/* All flagged lanes patched: restore registers and reload the result.  */
        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovups   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

/* Scalar fallback for the odd lane: element 2 * pair + 1.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    324(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(logf)

        vmovss    %xmm0, 388(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Scalar fallback for the even lane: element 2 * pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    320(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(logf)

        vmovss    %xmm0, 384(%rsp,%r15,8)
        jmp       .LBL_1_7

END(_ZGVdN8v_logf_avx2)