/* Function sincosf vectorized with SSE4.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_trig_data.h"

	.text
ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
/*
   In:   %xmm0 = four packed floats X
         %rdi  = where the four sin results are stored (16 bytes)
         %rsi  = where the four cos results are stored (16 bytes)
   Neither destination is required to be 16-byte aligned.

   ALGORITHM DESCRIPTION (as implemented below):

     1) Range reduction
        a) Split X into (X AND __sAbsMask) and (X ANDNOT __sAbsMask);
           presumably |X| and the sign bits of X - the mask value lives
           in svml_s_trig_data.h.
        b) Y = |X| * __sInvPI.
        c) Add the "Right Shifter" value (__sRShifter) and keep the sum
           as an integer S; S << 31 moves its lowest bit into the sign
           position.
        d) Subtract the "Right Shifter" again, giving N.
        e) For the cos part use Nc = N + (+-0.5); the sign of 0.5 is the
           sign of the partially reduced sin argument.
        f) Subtract N (Nc for cos) times a constant split into 4 parts:
             R = |X| - N*PI1 - N*PI2 - N*PI3 - N*PI4.
     2) Polynomial.  Both results use the same coefficients
        __sA3, __sA5, __sA7, __sA9:
             P(R) = R + R * R^2 * (A3 + R^2 * (A5 + R^2 * (A7 + R^2 * A9))).
     3) Sign setting.  The reduced arguments are XORed with the sign
        words derived from S before the polynomial; the sin result is
        finally XORed with the bits saved in step 1a.
     4) Lanes for which NOT (|X| <= __sRangeReductionVal) holds (this
        includes unordered compares) are recomputed one by one with the
        scalar sinf and cosf.  */

	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 64-byte aligned frame of 320 bytes; keeps %rsp 16-byte aligned
	   for the scalar calls on the slow path.  */
	andq	$-64, %rsp
	subq	$320, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	movups	%xmm12, 176(%rsp)
	movups	%xmm9, 160(%rsp)
	movups	__sAbsMask(%rax), %xmm12

/* 1a) xmm5 = X AND mask, xmm12 = X ANDNOT mask.  */
	movaps	%xmm12, %xmm5
	andnps	%xmm0, %xmm12
	movups	__sInvPI(%rax), %xmm7
	andps	%xmm0, %xmm5

/* 1b) Y = |X| * __sInvPI
   1c) Add "Right Shifter" value.  */
	mulps	%xmm5, %xmm7
	movups	%xmm10, 144(%rsp)
	movups	__sPI1(%rax), %xmm10

/* 1f) The subtraction R = |X| - N*PI1 - N*PI2 - N*PI3 - N*PI4 is
       interleaved with the remaining steps from here on.  */
	movaps	%xmm10, %xmm1
	addps	__sRShifter(%rax), %xmm7

/* 1c) Treat obtained value as integer S for destination sign setting.  */
	movaps	%xmm7, %xmm9

/* 1d) Subtract "Right Shifter" value.  */
	subps	__sRShifter(%rax), %xmm7
	mulps	%xmm7, %xmm1
	pslld	$31, %xmm9
	movups	__sPI2(%rax), %xmm6
	movups	%xmm13, 112(%rsp)
	movaps	%xmm5, %xmm13
	movaps	%xmm6, %xmm2
	subps	%xmm1, %xmm13
	mulps	%xmm7, %xmm2
	movups	__sSignMask(%rax), %xmm3
	movaps	%xmm5, %xmm1
	movups	__sOneHalf(%rax), %xmm4
	subps	%xmm2, %xmm13
	/* xmm5 = per-lane mask of NOT (|X| <= __sRangeReductionVal).  */
	cmpnleps __sRangeReductionVal(%rax), %xmm5
	movaps	%xmm3, %xmm2
	andps	%xmm13, %xmm2
	xorps	%xmm2, %xmm4

/* Result sign calculations.  */
	xorps	%xmm2, %xmm3
	xorps	%xmm9, %xmm3

/* 1e) Add correction term 0.5 for cos() part.  */
	addps	%xmm7, %xmm4
	/* ecx bits 0..3 = lanes that need the scalar fallback.  */
	movmskps %xmm5, %ecx
	mulps	%xmm4, %xmm10
	mulps	%xmm4, %xmm6
	subps	%xmm10, %xmm1
	movups	__sPI3(%rax), %xmm10
	subps	%xmm6, %xmm1
	movaps	%xmm10, %xmm6
	mulps	%xmm7, %xmm6
	mulps	%xmm4, %xmm10
	subps	%xmm6, %xmm13
	subps	%xmm10, %xmm1
	movups	__sPI4(%rax), %xmm6
	mulps	%xmm6, %xmm7
	mulps	%xmm6, %xmm4
	subps	%xmm7, %xmm13
	subps	%xmm4, %xmm1
	/* Apply the sign words: xmm13 = sin argument, xmm1 = cos argument.  */
	xorps	%xmm9, %xmm13
	xorps	%xmm3, %xmm1
	movaps	%xmm13, %xmm4
	movaps	%xmm1, %xmm2
	mulps	%xmm13, %xmm4
	mulps	%xmm1, %xmm2
	movups	__sA9(%rax), %xmm7

/* 2) Polynomial, evaluated for both arguments in parallel
      (xmm3/xmm4 for sin, xmm7/xmm2 for cos):
        P(R) = R + R * R^2 * (A3 + R^2 * (A5 + R^2 * (A7 + R^2 * A9))).  */
	movaps	%xmm7, %xmm3
	mulps	%xmm4, %xmm3
	mulps	%xmm2, %xmm7
	addps	__sA7(%rax), %xmm3
	addps	__sA7(%rax), %xmm7
	mulps	%xmm4, %xmm3
	mulps	%xmm2, %xmm7
	addps	__sA5(%rax), %xmm3
	addps	__sA5(%rax), %xmm7
	mulps	%xmm4, %xmm3
	mulps	%xmm2, %xmm7
	addps	__sA3(%rax), %xmm3
	addps	__sA3(%rax), %xmm7
	mulps	%xmm3, %xmm4
	mulps	%xmm7, %xmm2
	mulps	%xmm13, %xmm4
	mulps	%xmm1, %xmm2
	addps	%xmm4, %xmm13
	addps	%xmm2, %xmm1
	/* 3) Fold the bits saved in step 1a into the sin result.  */
	xorps	%xmm12, %xmm13
	testl	%ecx, %ecx
	jne	.LBL_1_3

.LBL_1_2:
	cfi_remember_state
	/* Common exit: xmm13 = sin results, xmm1 = cos results.  The
	   destinations are caller-supplied and nothing here guarantees
	   their alignment, so both stores are unaligned stores.  */
	movups	160(%rsp), %xmm9
	movups	%xmm13, (%rdi)
	movups	144(%rsp), %xmm10
	movups	176(%rsp), %xmm12
	movups	112(%rsp), %xmm13
	movups	%xmm1, (%rsi)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
	/* 4) Slow path, entered only when at least one mask bit is set.
	   Spill the inputs (128), the sin results (192) and the cos
	   results (256) so single lanes can be patched in memory.  */
	movups	%xmm0, 128(%rsp)
	movups	%xmm13, 192(%rsp)
	movups	%xmm1, 256(%rsp)

	xorb	%dl, %dl
	xorl	%eax, %eax
	movups	%xmm8, 48(%rsp)
	movups	%xmm11, 32(%rsp)
	movups	%xmm14, 16(%rsp)
	movups	%xmm15, (%rsp)
	movq	%rsi, 64(%rsp)
	movq	%r12, 104(%rsp)
	cfi_offset_rel_rsp (12, 104)
	/* r12b = index of the current pair of lanes.  */
	movb	%dl, %r12b
	movq	%r13, 96(%rsp)
	cfi_offset_rel_rsp (13, 96)
	/* r13d = mask bit number of the even lane of that pair.  */
	movl	%eax, %r13d
	movq	%r14, 88(%rsp)
	cfi_offset_rel_rsp (14, 88)
	/* r14d = fallback mask.  */
	movl	%ecx, %r14d
	movq	%r15, 80(%rsp)
	cfi_offset_rel_rsp (15, 80)
	movq	%rbx, 72(%rsp)
	/* DWARF register 3 is %rbx; describe its save slot like the others.  */
	cfi_offset_rel_rsp (3, 72)
	/* rbx keeps the sin destination across the scalar calls.  */
	movq	%rdi, %rbx
	cfi_remember_state

.LBL_1_6:
	/* Even lane of the pair.  */
	btl	%r13d, %r14d
	jc	.LBL_1_13

.LBL_1_7:
	/* Odd lane of the pair.  */
	lea	1(%r13), %esi
	btl	%esi, %r14d
	jc	.LBL_1_10

.LBL_1_8:
	/* The loop walks 16 pairs; movmskps only produces bits 0..3, so
	   pairs after the second never trigger a call.  */
	incb	%r12b
	addl	$2, %r13d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	/* Restore everything spilled for the slow path and reload the
	   patched results before taking the common exit.  */
	movups	48(%rsp), %xmm8
	movq	%rbx, %rdi
	movups	32(%rsp), %xmm11
	movups	16(%rsp), %xmm14
	movups	(%rsp), %xmm15
	movq	64(%rsp), %rsi
	movq	104(%rsp), %r12
	cfi_restore (%r12)
	movq	96(%rsp), %r13
	cfi_restore (%r13)
	movq	88(%rsp), %r14
	cfi_restore (%r14)
	movq	80(%rsp), %r15
	cfi_restore (%r15)
	movq	72(%rsp), %rbx
	cfi_restore (%rbx)
	movups	192(%rsp), %xmm13
	movups	256(%rsp), %xmm1
	jmp	.LBL_1_2

.LBL_1_10:
	cfi_restore_state
	/* Odd lane: input at 132 + 8 * pair, results at 196 / 260.  */
	movzbl	%r12b, %r15d
	movss	132(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	movss	%xmm0, 196(%rsp,%r15,8)
	movss	132(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	movss	%xmm0, 260(%rsp,%r15,8)
	jmp	.LBL_1_8

.LBL_1_13:
	/* Even lane: input at 128 + 8 * pair, results at 192 / 256.  */
	movzbl	%r12b, %r15d
	movss	128(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	movss	%xmm0, 192(%rsp,%r15,8)
	movss	128(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	movss	%xmm0, 256(%rsp,%r15,8)
	jmp	.LBL_1_7

END (_ZGVbN4vl4l4_sincosf_sse4)
libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)

/* vvv version implemented with wrapper to vl4l4 variant.
*/
/* In:   %xmm0 = four packed floats, passed through unchanged to the
                 vl4l4 kernel.
         LP64:   %xmm1:%xmm2 = four 64-bit sin destinations,
                 %xmm3:%xmm4 = four 64-bit cos destinations.
         ILP32:  %xmm1 = four 32-bit sin destinations,
                 %xmm2 = four 32-bit cos destinations.
   The kernel writes packed sin and cos results into two 16-byte stack
   buffers; each 32-bit result is then copied to its own destination,
   all sin lanes first, then all cos lanes.  */
ENTRY (_ZGVbN4vvv_sincosf_sse4)
#ifndef __ILP32__
	/* Frame layout:
	     0(%rsp)  packed sin results    16(%rsp)  packed cos results
	    32(%rsp)  sin pointers 0..3     64(%rsp)  cos pointers 0..3  */
	subq	$104, %rsp
	.cfi_def_cfa_offset 112
	movdqu	%xmm1, 32(%rsp)
	movdqu	%xmm2, 48(%rsp)
	movdqu	%xmm3, 64(%rsp)
	movdqu	%xmm4, 80(%rsp)
	movq	%rsp, %rdi
	leaq	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)

	/* sin lanes 0..3  */
	movq	32(%rsp), %rax
	movl	(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	40(%rsp), %rax
	movl	4(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	48(%rsp), %rax
	movl	8(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	56(%rsp), %rax
	movl	12(%rsp), %ecx
	movl	%ecx, (%rax)

	/* cos lanes 0..3  */
	movq	64(%rsp), %rax
	movl	16(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	72(%rsp), %rax
	movl	20(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	80(%rsp), %rax
	movl	24(%rsp), %ecx
	movl	%ecx, (%rax)
	movq	88(%rsp), %rax
	movl	28(%rsp), %ecx
	movl	%ecx, (%rax)

	addq	$104, %rsp
	.cfi_def_cfa_offset 8
	ret
#else
	/* Frame layout:
	     0(%esp)  cos pointers 0..3     16(%esp)  sin pointers 0..3
	    32(%esp)  packed sin results    48(%esp)  packed cos results  */
	subl	$72, %esp
	.cfi_def_cfa_offset 80
	movaps	%xmm2, (%esp)
	movaps	%xmm1, 16(%esp)
	leal	32(%rsp), %edi
	leal	48(%rsp), %esi
	call	HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)

	/* sin lanes 0..3  */
	movl	16(%esp), %eax
	movl	32(%esp), %ecx
	movl	%ecx, (%eax)
	movl	20(%esp), %eax
	movl	36(%esp), %ecx
	movl	%ecx, (%eax)
	movl	24(%esp), %eax
	movl	40(%esp), %ecx
	movl	%ecx, (%eax)
	movl	28(%esp), %eax
	movl	44(%esp), %ecx
	movl	%ecx, (%eax)

	/* cos lanes 0..3  */
	movl	(%esp), %eax
	movl	48(%esp), %ecx
	movl	%ecx, (%eax)
	movl	4(%esp), %eax
	movl	52(%esp), %ecx
	movl	%ecx, (%eax)
	movl	8(%esp), %eax
	movl	56(%esp), %ecx
	movl	%ecx, (%eax)
	movl	12(%esp), %eax
	movl	60(%esp), %ecx
	movl	%ecx, (%eax)

	addl	$72, %esp
	.cfi_def_cfa_offset 8
	ret
#endif
END (_ZGVbN4vvv_sincosf_sse4)