/* Function asinf vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
 *
 * IMPLEMENTATION NOTES (derived from the code below):
 *
 *      - Syntax is AT&T (source operand first, destination last).
 *      - The argument arrives in %xmm0 and the result is returned in
 *        %xmm0; four single-precision lanes are processed.
 *      - Poly(R) is evaluated as R + R*R2*P(R2), where R2 = R*R and
 *        P is the degree-4 polynomial whose coefficients are stored in
 *        poly_coeff.
 *      - In the SelMask lanes the code builds SQ ~ -2*sqrt(Y), with
 *        Y = 0.5 - 0.5*|x|, from an rsqrtps estimate plus a correction
 *        using sqrt_coeff, so that Pi/2 + SQ*(1 + Y*P(Y)) gives
 *        Pi/2 - 2*Poly(sqrt(Y)).
 *      - Lanes with 1.0 < |x| are recomputed one at a time by calling
 *        the scalar asinf.
 */

/* Offsets for data table __svml_sasin_data_internal.
   Each value is a byte offset from the start of the table and has to
   agree with the order and size (16 bytes per vector) of the entries
   defined at the end of this file.  */
#define AbsMask				0
#define OneHalf				16
#define SmallNorm			32
#define One				48
#define Two				64
#define sqrt_coeff			80
#define poly_coeff			112
#define Pi2H				192

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_asinf_sse4)
	/* Frame layout used by the special-input path only:
	     0(%rsp)   saved %r14
	     8(%rsp)   saved %r13
	    16(%rsp)   saved %r12
	    32(%rsp)   copy of the original argument vector
	    48(%rsp)   result vector
	   72 bytes plus the 8-byte return address give the CFA offset
	   of 80 declared below; with the usual entry alignment
	   (%rsp % 16 == 8) %rsp is 16-byte aligned at the call to
	   asinf.  */
	subq	$72, %rsp
	cfi_def_cfa_offset(80)
	movaps	%xmm0, %xmm2			/* xmm2 = original argument  */
	movups	__svml_sasin_data_internal(%rip), %xmm1	/* xmm1 = AbsMask (offset 0)  */
	movups	OneHalf+__svml_sasin_data_internal(%rip), %xmm5	/* xmm5 = 0.5  */

	/* x = |arg| */
	movaps	%xmm1, %xmm0
	andps	%xmm2, %xmm0			/* xmm0 = |arg|  */

	/* Y = 0.5 - 0.5*x */
	movaps	%xmm5, %xmm3
	mulps	%xmm0, %xmm3			/* xmm3 = 0.5*x  */
	movaps	%xmm5, %xmm8

	/* x^2 */
	movaps	%xmm0, %xmm14
	movaps	%xmm0, %xmm15
	mulps	%xmm0, %xmm14			/* xmm14 = x^2  */
	subps	%xmm3, %xmm8			/* xmm8 = Y  */
	cmpnltps %xmm5, %xmm15			/* xmm15 = SelMask = !(x < 0.5)  */

	/* SQ ~ -2*sqrt(Y) */
	rsqrtps	%xmm8, %xmm6			/* xmm6 = approximate 1/sqrt(Y)  */
	minps	%xmm8, %xmm14			/* xmm14 = R2 = min(x^2, Y)  */
	movaps	%xmm8, %xmm9
	movaps	%xmm14, %xmm10
	cmpltps	SmallNorm+__svml_sasin_data_internal(%rip), %xmm9	/* xmm9 = (Y < SmallNorm)  */
	mulps	%xmm14, %xmm10			/* xmm10 = R2^2  */
	addps	%xmm8, %xmm8			/* xmm8 = 2*Y  */
	andnps	%xmm6, %xmm9			/* xmm9 = r: rsqrt estimate, zeroed where Y < SmallNorm  */
	movaps	%xmm15, %xmm3
	movaps	%xmm9, %xmm7
	andnps	%xmm0, %xmm3			/* xmm3 = x in lanes where SelMask is clear, else 0  */
	mulps	%xmm9, %xmm7			/* xmm7 = r^2  */
	andnps	%xmm2, %xmm1			/* xmm1 = arg & ~AbsMask = sign bits of arg  */
	mulps	%xmm8, %xmm9			/* xmm9 = 2*Y*r  */
	mulps	%xmm7, %xmm8			/* xmm8 = 2*Y*r^2  */

	/* polynomial */
	/* Two independent chains are interleaved from here on:
	     xmm11/xmm12/xmm14 - P(R2) from poly_coeff
	     xmm8/xmm13        - correction of the square-root estimate,
				 with E = 2*Y*r^2 - 2.
	   The order matters: xmm8 is read as E and then overwritten.  */
	movups	poly_coeff+__svml_sasin_data_internal(%rip), %xmm11
	mulps	%xmm14, %xmm11			/* xmm11 = c5*R2  */
	subps	Two+__svml_sasin_data_internal(%rip), %xmm8	/* xmm8 = E  */
	movups	poly_coeff+32+__svml_sasin_data_internal(%rip), %xmm12
	mulps	%xmm14, %xmm12			/* xmm12 = c3*R2  */
	addps	poly_coeff+16+__svml_sasin_data_internal(%rip), %xmm11	/* xmm11 = c5*R2 + c4  */
	mulps	%xmm10, %xmm11			/* xmm11 = (c5*R2 + c4)*R2^2  */
	addps	poly_coeff+48+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 = c3*R2 + c2  */
	movups	sqrt_coeff+__svml_sasin_data_internal(%rip), %xmm13
	addps	%xmm11, %xmm12			/* xmm12 = c2 + c3*R2 + c4*R2^2 + c5*R2^3  */
	mulps	%xmm8, %xmm13			/* xmm13 = sqrt_coeff2*E  */
	mulps	%xmm9, %xmm8			/* xmm8 = E*(2*Y*r)  */
	mulps	%xmm14, %xmm12
	addps	sqrt_coeff+16+__svml_sasin_data_internal(%rip), %xmm13	/* xmm13 = sqrt_coeff2*E + sqrt_coeff1  */
	addps	poly_coeff+64+__svml_sasin_data_internal(%rip), %xmm12	/* xmm12 = c1 + c2*R2 + ... + c5*R2^4  */
	mulps	%xmm8, %xmm13
	mulps	%xmm12, %xmm14			/* xmm14 = R2*P(R2)  */
	subps	%xmm9, %xmm13			/* xmm13 = SQ = corrected -(2*Y*r)  */
	andps	%xmm15, %xmm13			/* keep SQ only in SelMask lanes  */
	orps	%xmm13, %xmm3			/* xmm3 = SelMask ? SQ : x  */
	mulps	%xmm3, %xmm14
	movups	One+__svml_sasin_data_internal(%rip), %xmm4
	addps	%xmm14, %xmm3			/* xmm3 = xmm3*(1 + R2*P(R2))  */
	cmpltps	%xmm0, %xmm4			/* xmm4 = (1.0 < x): lanes needing the scalar path  */
	movups	Pi2H+__svml_sasin_data_internal(%rip), %xmm0
	andps	%xmm15, %xmm0			/* xmm0 = SelMask ? Pi/2 : 0  */
	movmskps %xmm4, %edx			/* edx = one bit per special lane  */
	addps	%xmm3, %xmm0
	pxor	%xmm1, %xmm0			/* apply the sign of the argument  */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* The code after the ret still runs with the 72-byte frame
	   allocated, so the CFA offset is set back to 80.  */
	cfi_def_cfa_offset(80)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the original argument and the vector result so that
	   individual lanes can be read and patched through memory.  */
	movups	%xmm2, 32(%rsp)
	movups	%xmm0, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* r12, r13 and r14 are saved before use because their values
	   must survive the call to asinf: r12d becomes the lane index
	   (starting at 0) and r13d the special-lane bit mask.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of r13d, i.e. "lane r12d is special".  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	/* Advance to the next lane; all four lanes are visited.  */
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* All lanes done: restore the saved registers and reload the
	   result vector, which now contains the scalar results in the
	   special lanes.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* CFI state for the code below, which is reached only while
	   r12, r13 and r14 are still saved on the stack.  */
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends the lane index into r14 so it
	   can be used as a 64-bit index register.  */
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0	/* xmm0 = original argument of this lane  */
	call	asinf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movss	%xmm0, 48(%rsp, %r14, 4)	/* overwrite this lane of the result  */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_asinf_sse4)

	.section .rodata, "a"
	.align	16

	/* Constant table.  Every entry is one 16-byte vector holding the
	   same 32-bit value in all four lanes.  Several entries are used
	   directly as memory operands of non-VEX SSE arithmetic
	   instructions above, which need 16-byte aligned addresses, so
	   the alignment directives are required and not merely an
	   optimization.  The typedef below is compiled only when
	   __svml_sasin_data_internal_typedef is defined; it documents the
	   layout that the offset macros at the top of the file encode.  */
#ifdef __svml_sasin_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 AbsMask[4][1];
	__declspec(align(16)) VUINT32 OneHalf[4][1];
	__declspec(align(16)) VUINT32 SmallNorm[4][1];
	__declspec(align(16)) VUINT32 One[4][1];
	__declspec(align(16)) VUINT32 Two[4][1];
	__declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
	__declspec(align(16)) VUINT32 poly_coeff[5][4][1];
	__declspec(align(16)) VUINT32 Pi2H[4][1];
} __svml_sasin_data_internal;
#endif
__svml_sasin_data_internal:
	/* AbsMask: all bits except the sign bit */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* OneHalf: 0.5f */
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
	/* SmallNorm: 2^-32; below this Y the rsqrt estimate is zeroed */
	.align	16
	.long	0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
	/* One: 1.0f, upper bound of the valid |x| range */
	.align	16
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* Two: 2.0f */
	.align	16
	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000
	/* sqrt_coeff[2]: stored highest-numbered coefficient first */
	.align	16
	.long	0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
	.long	0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
	/* poly_coeff[5]: stored highest-numbered coefficient first */
	.align	16
	.long	0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
	.long	0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
	.long	0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
	.long	0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
	.long	0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
	/* Pi2H: Pi/2 rounded to single precision */
	.align	16
	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
	.align	16
	.type	__svml_sasin_data_internal, @object
	.size	__svml_sasin_data_internal, .-__svml_sasin_data_internal