/* Function atanf vectorized with AVX2.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */

/* Offsets for data table __svml_satan_data_internal
   Every entry of the table is one 32-byte row (eight 32-bit lanes
   holding the same value), so consecutive offsets differ by 32.
 */
#define _sSIGN_MASK			0
#define _sABS_MASK			32
#define _sONE				64
#define _sPIO2				96
#define _sPC8				128
#define _sPC7				160
#define _sPC6				192
#define _sPC5				224
#define _sPC4				256
#define _sPC3				288
#define _sPC2				320
#define _sPC1				352
#define _sPC0				384

#include <sysdep.h>

	.section .text.avx2, "ax", @progbits
/* _ZGVdN8v_atanf_avx2: arctangent of eight packed single-precision values.
   Syntax: AT&T (source operands first, destination last).
   In:     %ymm0 = x, eight float lanes.
   Out:    %ymm0 = atan(x), lane by lane.
   Clobb:  %ymm1-%ymm15 are all written.  No general-purpose register
           and no stack memory is touched; the routine makes no calls.
   NOTE(review): the _ZGVdN8v prefix presumably follows the x86-64 vector
   function ABI naming (AVX2, 8 lanes, unmasked) - confirm against the
   ABI document.  */
ENTRY(_ZGVdN8v_atanf_avx2)
	/*
	 * 1) If x>1,      then r=-1/x, PIO2=Pi/2
	 * 2) If -1<=x<=1, then r=x,    PIO2=0
	 * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
	 *
	 * The result is then r*P(r^2) + PIO2, where P is the degree-8
	 * polynomial in r^2 with coefficients _sPC0.._sPC8.
	 */
	vmovups	_sONE+__svml_satan_data_internal(%rip), %ymm2	/* ymm2 = 1.0f in every lane */
	vmovups	__svml_satan_data_internal(%rip), %ymm7	/* ymm7 = sign-bit mask (offset 0 is _sSIGN_MASK) */
	vmovups	_sPC7+__svml_satan_data_internal(%rip), %ymm13	/* ymm13 = PC7, start of the odd-coefficient chain */

	/*
	 * To use minps\maxps operations for argument reduction
	 * uncomment _AT_USEMINMAX_ definition
	 *  Declarations
	 * Variables
	 * Constants
	 *
	 * NOTE(review): no _AT_USEMINMAX_ definition appears in this file;
	 * the code below uses vminps/vmaxps unconditionally.
	 */
	vandps	_sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3	/* ymm3 = |x| */
	vmaxps	%ymm3, %ymm2, %ymm5	/* ymm5 = max(1, |x|): the divisor */
	vminps	%ymm3, %ymm2, %ymm4	/* ymm4 = min(1, |x|): the dividend */
	vcmple_oqps %ymm2, %ymm3, %ymm6	/* ymm6 = all-ones where |x| <= 1 (ordered compare), else 0 */
	vdivps	%ymm5, %ymm4, %ymm11	/* ymm11 = ymm4 / ymm5: |x| if |x| <= 1, else 1/|x| */
	vandps	%ymm7, %ymm0, %ymm9	/* ymm9 = sign bit of x (last read of x from ymm0) */
	vandnps	%ymm7, %ymm6, %ymm8	/* ymm8 = ~ymm6 & sign mask: sign bit set only where the compare was false */
	vxorps	%ymm9, %ymm8, %ymm10	/* ymm10 = sign of x, flipped in the lanes that take the reciprocal */
	vxorps	%ymm11, %ymm10, %ymm15	/* ymm15 = r: x if |x| <= 1, else -1/x */

	/* Polynomial.
	   P(r^2) is split into two Horner chains in r^4 so that they do not
	   depend on each other:
	     ymm0  (even): ((PC8*r^4 + PC6)*r^4 + PC4)*r^4 + PC2
	     ymm13 (odd):  ((PC7*r^4 + PC5)*r^4 + PC3)*r^4 + PC1
	   In AT&T operand order, vfmadd213ps a, b, c computes c = b*c + a.  */
	vmulps	%ymm15, %ymm15, %ymm14	/* ymm14 = r^2 */
	vmovups	_sPC8+__svml_satan_data_internal(%rip), %ymm0	/* ymm0 = PC8 (x itself is no longer needed) */
	vmulps	%ymm14, %ymm14, %ymm12	/* ymm12 = r^4 */
	vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0	/* even = PC8*r^4 + PC6 */
	vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13	/* odd  = PC7*r^4 + PC5 */
	vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0	/* even = even*r^4 + PC4 */
	vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13	/* odd  = odd*r^4 + PC3 */
	vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0	/* even = even*r^4 + PC2 */
	vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13	/* odd  = odd*r^4 + PC1 */
	vfmadd213ps %ymm13, %ymm14, %ymm0	/* ymm0 = even*r^2 + odd */
	vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0	/* ymm0 = ymm0*r^2 + PC0 = P(r^2) */
	vandnps	_sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1	/* ymm1 = ~ymm6 & PIO2: Pi/2 where the compare was false, else 0 */
	vxorps	%ymm9, %ymm1, %ymm1	/* give that offset the sign of x */

	/* Reconstruction.  */
	vfmadd213ps %ymm1, %ymm15, %ymm0	/* ymm0 = r*P(r^2) + signed offset */
	ret

END(_ZGVdN8v_atanf_avx2)

	.section .rodata, "a"
	.align	32

/* C-style description of the table layout.  It is only seen by the
   preprocessor when __svml_satan_data_internal_typedef is defined, and
   nothing in this file defines it.  The member order matches the offset
   macros at the top of the file.  */
#ifdef __svml_satan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
	__declspec(align(32)) VUINT32 _sABS_MASK[8][1];
	__declspec(align(32)) VUINT32 _sONE[8][1];
	__declspec(align(32)) VUINT32 _sPIO2[8][1];
	__declspec(align(32)) VUINT32 _sPC8[8][1];
	__declspec(align(32)) VUINT32 _sPC7[8][1];
	__declspec(align(32)) VUINT32 _sPC6[8][1];
	__declspec(align(32)) VUINT32 _sPC5[8][1];
	__declspec(align(32)) VUINT32 _sPC4[8][1];
	__declspec(align(32)) VUINT32 _sPC3[8][1];
	__declspec(align(32)) VUINT32 _sPC2[8][1];
	__declspec(align(32)) VUINT32 _sPC1[8][1];
	__declspec(align(32)) VUINT32 _sPC0[8][1];
} __svml_satan_data_internal;
#endif
/* Constant table: one 32-byte row per entry, each value repeated in all
   eight lanes so a row can be used directly as a 256-bit operand.  The
   values are raw IEEE-754 single-precision bit patterns.  */
__svml_satan_data_internal:
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK
	.align	32
	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK
	.align	32
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE (1.0f)
	.align	32
	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2
	.align	32
	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
	.align	32
	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
	.align	32
	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
	.align	32
	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
	.align	32
	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
	.align	32
	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
	.align	32
	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
	.align	32
	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1
	.align	32
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0 (1.0f)
	.align	32
	.type	__svml_satan_data_internal, @object
	.size	__svml_satan_data_internal, .-__svml_satan_data_internal