/* Function cosf vectorized with AVX2.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <sysdep.h>
#include "svml_s_trig_data.h"

        .text
ENTRY (_ZGVdN8v_cosf_avx2)
/*
   Syntax: GAS, AT&T operand order (source(s) first, destination last).

   In:   ymm0 = eight packed single-precision arguments X.
   Out:  ymm0 = eight packed results.

   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) We remove sign using AND operation
      b) Add Pi/2 value to argument X for Cos to Sin transformation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Subtract "Right Shifter" value
      g) Subtract 0.5 from result for octant correction
      h) Subtract Y*PI from X argument, where PI is split into parts.
         This FMA variant uses three parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );

   Register roles on the main path:
     rax   address of __svml_s_trig_data (loaded through the GOT)
     ymm2  copy of the original argument X
     ymm5  "Right Shifter" constant, later X^2 of the reduced argument
     ymm7  first part of PI (__sPI1_FMA)
     ymm4  running value of the reduction, finally the reduced argument
     ymm0  sign bits (S << 31), later the polynomial accumulator / result
     ymm1  per-lane mask of arguments needing the scalar fallback
     ecx   the same mask compressed to one bit per lane (bits 0..7)

   Stack frame (rsp is 64-byte aligned after the prologue, 448 bytes):
     0..255    spill area for ymm15 .. ymm8 (32 bytes each)
     256, 264  saved rdi, rsi
     272..296  saved r15, r14, r13, r12
     320..351  the eight original arguments
     384..415  the eight results
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %ymm0, %ymm2
        vmovups   __sRShifter(%rax), %ymm5
        vmovups   __sPI1_FMA(%rax), %ymm7

/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
        vaddps    __sHalfPI(%rax), %ymm2, %ymm4

/*
   1) Range reduction to [-Pi/2; +Pi/2] interval
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" (0x4B000000) value
   ymm4 = ymm4 * InvPI + RShifter
 */
        vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4

/* f) Subtract "Right Shifter" (0x4B000000) value */
        vsubps    %ymm5, %ymm4, %ymm6

/*
   e) Treat obtained value as integer for destination sign setting.
      Shift first bit of this value to the last (sign) position (S << 31)
 */
        vpslld    $31, %ymm4, %ymm0

/* g) Subtract 0.5 from result for octant correction */
        vsubps    __sOneHalf(%rax), %ymm6, %ymm4

/*
   Check for large and special arguments.
   ymm3 = |X|; the NLE_UQ predicate sets a lane when |X| is not
   less-or-equal to the threshold, which also covers unordered (NaN)
   lanes.
 */
        vandps    __sAbsMask(%rax), %ymm2, %ymm3
        vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1

/*
   h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3
 */
        vmovaps   %ymm2, %ymm3
        vfnmadd231ps %ymm4, %ymm7, %ymm3
        vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
        vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4

/* a) Calculate X^2 = X * X */
        vmulps    %ymm4, %ymm4, %ymm5

/*
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
        vxorps    %ymm0, %ymm4, %ymm6
        vmovups   __sA9_FMA(%rax), %ymm0

/*
   b) Calculate polynomial:
      R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
 */
        vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA3(%rax), %ymm5, %ymm0
        vmulps    %ymm5, %ymm0, %ymm0
        vmovmskps %ymm1, %ecx
        vfmadd213ps %ymm6, %ymm6, %ymm0

/* Any lane flagged above is recomputed with the scalar routine.  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: result is in ymm0.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/*
   Slow path, reached only when ecx != 0.  Park the original arguments
   and the vector result on the stack so individual lanes can be read
   and patched.
 */
.LBL_1_3:
        cfi_restore_state
        vmovups   %ymm2, 320(%rsp)
        vmovups   %ymm0, 384(%rsp)

        xorb      %dl, %dl
        xorl      %eax, %eax

/* Spill ymm8-ymm15, rsi and rdi; they are reloaded after the loop.  */
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)

/*
   Loop state lives in callee-saved registers so it survives the calls:
     r12b = index of the current lane pair (starts at 0)
     r13d = lane mask from vmovmskps
     r14d = bit number of the even lane of the pair (2 * r12b)
     r15  = scratch copy of the pair index used for addressing
 */
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        movb      %dl, %r12b
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        movl      %eax, %r14d
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

/* Even lane of the pair flagged?  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

/* Odd lane of the pair flagged?  */
.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

/*
   Advance to the next pair.  vmovmskps on a ymm register yields only
   mask bits 0..7, so four pairs cover every lane.
 */
.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $4, %r12b
        jb        .LBL_1_6

/* All lanes handled: reload spilled state and the patched result.  */
        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovups   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

/*
   Odd lane (2 * pair + 1): argument at 320 + 8 * pair + 4, result
   written to the matching slot at 384 + 8 * pair + 4.
 */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    324(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 388(%rsp,%r15,8)
        jmp       .LBL_1_8

/*
   Even lane (2 * pair): argument at 320 + 8 * pair, result written to
   the matching slot at 384 + 8 * pair.
 */
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    320(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 384(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVdN8v_cosf_avx2)