/* Function acosf vectorized with SSE4.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
 *
 * IMPLEMENTATION NOTES (AT&T syntax, x86-64):
 *
 *      In:   xmm0 = four packed single-precision arguments
 *      Out:  xmm0 = four packed single-precision results
 *
 *      The fast path is branch free and is evaluated for all four
 *      lanes.  Lanes with |x| > 1 or x = NaN are flagged in a 4-bit
 *      mask; if any bit is set, those lanes are recomputed one at a
 *      time with the scalar acosf and patched into the vector result.
 *
 *      Stack frame (72 bytes; %rsp is 16-byte aligned after the
 *      adjustment, as required at the scalar call):
 *           0(%rsp)  saved r14     (slow path only)
 *           8(%rsp)  saved r13     (slow path only)
 *          16(%rsp)  saved r12     (slow path only)
 *          32(%rsp)  copy of the input vector
 *          48(%rsp)  result vector being patched
 *
 *      The data table is 16-byte aligned and every field offset is
 *      a multiple of 16, so fields can be used directly as memory
 *      operands of SSE arithmetic instructions.
 */

/* Offsets for data table __svml_sacos_data_internal
 */
#define SgnBit				0
#define OneHalf				16
#define SmallNorm			32
#define MOne				48
#define Two				64
#define sqrt_coeff			80
#define poly_coeff			112
#define Pi2H				192
#define PiH				208

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_acosf_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/* Keep a copy of the argument for the sign test below
	   (x < R2 holds exactly for negative x, since R2 >= 0).  */
	movaps	%xmm0, %xmm14

	/* Load the sign-bit mask and the constant 0.5.  */
	movups	SgnBit+__svml_sacos_data_internal(%rip), %xmm3
	movups	OneHalf+__svml_sacos_data_internal(%rip), %xmm5

	/* nax = -|x|  (force the sign bit on)  */
	movaps	%xmm3, %xmm4
	orps	%xmm0, %xmm4

	/* Y = 0.5 + 0.5*nax = 0.5 - 0.5*|x|  (completed by the addps below)  */
	movaps	%xmm5, %xmm6
	mulps	%xmm4, %xmm6

	/* x^2 */
	movaps	%xmm4, %xmm13
	mulps	%xmm4, %xmm13
	addps	%xmm6, %xmm5			/* xmm5 = Y */

	/*
	 * S ~ 2*sqrt(Y), built from the rsqrtps estimate r ~ 1/sqrt(Y):
	 *      S = 2*Y*r,  E = 2*Y*r^2 - 2,
	 *      2*sqrt(Y) ~ S - S*E*(sqrt_coeff1 + sqrt_coeff2*E)
	 * Interleaved with it: R2 = min(x^2, Y) (the polynomial argument),
	 * SelMask (|x| >= 0.5) and the negative-argument mask.
	 */
	rsqrtps	%xmm5, %xmm8			/* r ~ 1/sqrt(Y) */
	minps	%xmm5, %xmm13			/* R2 = min(x^2, Y) */
	movaps	%xmm5, %xmm2
	movaps	%xmm13, %xmm1
	cmpltps	SmallNorm+__svml_sacos_data_internal(%rip), %xmm2	/* Y < SmallNorm ? */
	cmpnltps	%xmm5, %xmm1		/* SelMask = !(R2 < Y) */
	cmpltps	%xmm13, %xmm14			/* NegMask = (x < R2) */
	addps	%xmm5, %xmm5			/* 2*Y */
	andnps	%xmm8, %xmm2			/* r, or 0 where Y < SmallNorm */
	movaps	%xmm13, %xmm11
	movaps	%xmm2, %xmm9
	movaps	%xmm1, %xmm6
	mulps	%xmm2, %xmm9			/* r^2 */
	andnps	%xmm4, %xmm6			/* !SelMask ? nax : 0 */
	mulps	%xmm5, %xmm2			/* S = 2*Y*r */
	mulps	%xmm13, %xmm11			/* R4 = R2^2 */
	mulps	%xmm9, %xmm5			/* 2*Y*r^2 */
	movups	sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10
	andps	%xmm0, %xmm3			/* sign bit of x */

	/*
	 * Polynomial, evaluated as
	 *      P = R2*(c1 + R2*((c2 + c3*R2) + R4*(c4 + c5*R2)))
	 * and interleaved with the square-root correction, the
	 * special-input mask and the selection of the additive constant.
	 */
	movups	poly_coeff+__svml_sacos_data_internal(%rip), %xmm12
	movaps	%xmm1, %xmm15			/* second copy of SelMask */
	mulps	%xmm13, %xmm12			/* c5*R2 */
	subps	Two+__svml_sacos_data_internal(%rip), %xmm5	/* E = 2*Y*r^2 - 2 */
	mulps	%xmm5, %xmm10			/* sqrt_coeff2*E */
	addps	poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12	/* c4 + c5*R2 */
	mulps	%xmm2, %xmm5			/* E*S */
	mulps	%xmm11, %xmm12			/* R4*(c4 + c5*R2) */
	addps	sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10	/* sqrt_coeff1 + sqrt_coeff2*E */
	mulps	%xmm5, %xmm10			/* correction term */
	movups	poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5
	subps	%xmm10, %xmm2			/* refined 2*sqrt(Y) */
	mulps	%xmm13, %xmm5			/* c3*R2 */
	movups	MOne+__svml_sacos_data_internal(%rip), %xmm7
	andps	%xmm1, %xmm2			/* SelMask ? 2*sqrt(Y) : 0 */

	/* Special-input mask: !(-1 <= nax), i.e. |x| > 1; the negated
	   predicate is also true for NaN.  */
	cmpnleps	%xmm4, %xmm7
	addps	poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5	/* c2 + c3*R2 */
	movmskps	%xmm7, %edx		/* one bit per special lane */
	orps	%xmm2, %xmm6			/* A = SelMask ? 2*sqrt(Y) : nax */
	addps	%xmm12, %xmm5
	mulps	%xmm13, %xmm5

	/* Flip A's sign by the sign of x: A = SelMask ? sign(x)*2*sqrt(Y) : -x.
	   xorps gives the same bits as the integer pxor while staying in
	   the same instruction family as the surrounding andps/orps.  */
	xorps	%xmm3, %xmm6
	movups	PiH+__svml_sacos_data_internal(%rip), %xmm7
	andps	%xmm1, %xmm7			/* SelMask ? Pi : 0 */
	addps	poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5	/* + c1 */
	mulps	%xmm13, %xmm5			/* P */
	andps	%xmm14, %xmm7			/* (SelMask && x < 0) ? Pi : 0 */
	mulps	%xmm6, %xmm5			/* A*P */
	andnps	Pi2H+__svml_sacos_data_internal(%rip), %xmm15	/* !SelMask ? Pi/2 : 0 */
	addps	%xmm5, %xmm6			/* A + A*P */
	addps	%xmm15, %xmm7			/* additive constant */
	addps	%xmm6, %xmm7			/* result */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm7

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movaps	%xmm7, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(80)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the arguments and the fast-path results: xmm registers
	   are not preserved across the scalar call.  */
	movups	%xmm0, 32(%rsp)
	movups	%xmm7, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* Save r12-r14 (callee-saved) and use them as loop state that
	   survives the call: r12d = lane index, r13d = lane mask.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	/* Reload the patched result vector.  */
	movups	48(%rsp), %xmm7

	/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm7

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* 32-bit move zero-extends the lane index for use in addressing.  */
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0
	call	acosf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the saved result vector.  */
	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_acosf_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_sacos_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 SgnBit[4][1];
	__declspec(align(16)) VUINT32 OneHalf[4][1];
	__declspec(align(16)) VUINT32 SmallNorm[4][1];
	__declspec(align(16)) VUINT32 MOne[4][1];
	__declspec(align(16)) VUINT32 Two[4][1];
	__declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
	__declspec(align(16)) VUINT32 poly_coeff[5][4][1];
	__declspec(align(16)) VUINT32 Pi2H[4][1];
	__declspec(align(16)) VUINT32 PiH[4][1];
} __svml_sacos_data_internal;
#endif
/* Constant table; each entry is one value replicated across the four
   lanes.  Field order and size must match the offset macros at the
   top of the file.  */
__svml_sacos_data_internal:
	/* SgnBit: IEEE single-precision sign-bit mask */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
	/* OneHalf: 0.5 */
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
	/* SmallNorm: 2^-32 */
	.align	16
	.long	0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
	/* MOne: -1.0 */
	.align	16
	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
	/* Two: 2.0 */
	.align	16
	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000
	/* sqrt_coeff[2] */
	.align	16
	.long	0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
	.long	0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
	/* poly_coeff[5] */
	.align	16
	.long	0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
	.long	0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
	.long	0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
	.long	0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
	.long	0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
	/* Pi2H: Pi/2 rounded to single precision */
	.align	16
	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
	/* PiH: Pi rounded to single precision */
	.align	16
	.long	0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
	.align	16
	.type	__svml_sacos_data_internal, @object
	.size	__svml_sacos_data_internal, .-__svml_sacos_data_internal