/* Wrapper implementations of vector math functions.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* SSE2 ISA version as wrapper to scalar.

   In:   %xmm0 = four packed single-precision inputs.
   Out:  %xmm0 = four packed results; lane i is the low element that
	 callee returned in %xmm0 when called with input lane i.

   callee is invoked four times through JUMPTARGET, each time with one
   input in the low element of %xmm0.

   Stack frame (40 bytes):
      0..15   copy of the input vector
     16..27   results of calls 0, 1 and 2
   The result of call 3 is consumed directly from %xmm0.

   The 40-byte adjustment makes %rsp 16-byte aligned for movaps and for
   the calls, assuming the usual x86-64 SysV entry alignment
   (%rsp = 8 mod 16).  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	movaps	%xmm0, (%rsp)

	/* Lane 0 is already the low element of %xmm0.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 16(%rsp)

	movss	4(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 20(%rsp)

	movss	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 24(%rsp)

	movss	12(%rsp), %xmm0
	call	JUMPTARGET(\callee)

	/* Reload results 0..2; result 3 (r3) is still in %xmm0, so it is
	   not spilled to the stack.  */
	movss	16(%rsp), %xmm3		/* xmm3 = { r0, 0, 0, 0 }  */
	movss	20(%rsp), %xmm2		/* xmm2 = { r1, 0, 0, 0 }  */
	movss	24(%rsp), %xmm1		/* xmm1 = { r2, 0, 0, 0 }  */
	unpcklps	%xmm1, %xmm3	/* xmm3 = { r0, r2, ... }  */
	unpcklps	%xmm0, %xmm2	/* xmm2 = { r1, r3, ... }  */
	unpcklps	%xmm2, %xmm3	/* xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0

	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	ret
.endm

/* 2 argument SSE2 ISA version as wrapper to scalar.
*/

/* In:   %xmm0, %xmm1 = four packed single-precision inputs each.
   Out:  %xmm0 = four packed results; lane i is the low element that
	 callee returned in %xmm0 when called with lane i of the first
	 input in %xmm0 and lane i of the second input in %xmm1.

   Stack frame (56 bytes):
      0..15   copy of the first input vector
     16..31   copy of the second input vector
     32..43   results of calls 0, 1 and 2
   The result of call 3 is consumed directly from %xmm0.

   The 56-byte adjustment makes %rsp 16-byte aligned for movaps and for
   the calls, assuming the usual x86-64 SysV entry alignment
   (%rsp = 8 mod 16).  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset(56)
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)

	/* Lane 0 of both inputs is already in place.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 32(%rsp)

	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 36(%rsp)

	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 40(%rsp)

	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	JUMPTARGET(\callee)

	/* Reload results 0..2; result 3 (r3) is still in %xmm0, so it is
	   not spilled to the stack.  */
	movss	32(%rsp), %xmm3		/* xmm3 = { r0, 0, 0, 0 }  */
	movss	36(%rsp), %xmm2		/* xmm2 = { r1, 0, 0, 0 }  */
	movss	40(%rsp), %xmm1		/* xmm1 = { r2, 0, 0, 0 }  */
	unpcklps	%xmm1, %xmm3	/* xmm3 = { r0, r2, ... }  */
	unpcklps	%xmm0, %xmm2	/* xmm2 = { r1, r3, ... }  */
	unpcklps	%xmm2, %xmm3	/* xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0

	addq	$56, %rsp
	cfi_adjust_cfa_offset(-56)
	ret
.endm

/* 3 argument SSE2 ISA version as wrapper to scalar.

   In:   %xmm0 = four packed single-precision inputs,
	 %rdi, %rsi = two output pointers.
   Out:  for each lane i, callee is called with input lane i in the low
	 element of %xmm0 and with %rdi/%rsi pointing at two 4-byte stack
	 slots; the values it leaves there are copied to element i of the
	 buffers given by the original %rdi and %rsi respectively.

   Register roles across the calls:
     %rbp = original %rdi, %rbx = original %rsi (both callee-saved,
     pushed on entry and restored on exit).

   Stack frame below the two pushes (40 bytes):
      0..15   copy of the input vector
     24..27   slot passed in %rsi
     28..31   slot passed in %rdi  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)

	/* Lane 0.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movaps	%xmm0, (%rsp)
	call	JUMPTARGET(\callee)

	/* Store the lane 0 outputs, then call for lane 1.  %rdi and %rsi
	   are reloaded because they do not survive a call.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	/* broadcast input lane 1  */
	call	JUMPTARGET(\callee)

	/* Store the lane 1 outputs, then call for lane 2.  */
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	movaps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm0	/* low element = input lane 2  */
	call	JUMPTARGET(\callee)

	/* Store the lane 2 outputs, then call for lane 3.  */
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	/* broadcast input lane 3  */
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	movaps	%xmm1, %xmm0
	call	JUMPTARGET(\callee)

	/* Store the lane 3 outputs.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)

	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:   %ymm0 = input vector.
   Out:  %ymm0 = result of callee on the low 128 bits of the input in
	 its low half, result of callee on the high 128 bits in its
	 high half.

   %rbp is used as frame pointer so that %rsp can be realigned to 32
   bytes.  Stack frame (32 bytes):
      0..15   high half of the input
     16..31   result for the low half  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp

	vextractf128	$1, %ymm0, (%rsp)
	/* Clear the upper halves before calling the 128-bit callee; the
	   low half of the input stays in %xmm0.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 16(%rsp)

	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* Recombine: low result from the stack, high result from the
	   second call.  */
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128	$1, %xmm1, %ymm0, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:   %ymm0, %ymm1 = input vectors.
   Out:  %ymm0 = result of callee on the low halves of both inputs in
	 its low half, result of callee on the high halves in its high
	 half.

   %rbp is used as frame pointer so that %rsp can be realigned to 32
   bytes.  Stack frame (64 bytes):
      0..15   high half of the second input
     16..31   high half of the first input
     32..47   result for the low halves  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp

	vextractf128	$1, %ymm0, 16(%rsp)
	vextractf128	$1, %ymm1, (%rsp)
	/* Clear the upper halves before calling the 128-bit callee; the
	   low halves of the inputs stay in %xmm0 and %xmm1.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)

	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)

	/* Recombine: low result from the stack, high result from the
	   second call.  */
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128	$1, %xmm1, %ymm0, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.
*/

/* In:   %ymm0 = input vector, %rdi, %rsi = two output pointers.
   Out:  the first call receives the low 128 bits of the input and the
	 original %rdi/%rsi unchanged.  The second call receives the
	 high 128 bits and two 16-byte stack buffers; their contents
	 are then copied to offset 16 of the buffers given by the
	 original %rdi and %rsi.

   Register roles across the calls:
     %r13 = original %rdi, %r14 = original %rsi (both callee-saved).

   %r13 and %r14 are pushed BEFORE %rsp is realigned, so they live at
   fixed offsets from %rbp (-8 and -16) and can be described to the
   unwinder with cfi_rel_offset while %rbp is the CFA register.  The
   CFA offset is not adjusted for these pushes, because the CFA is
   tracked through %rbp, which does not move.

   Stack frame below the realigned %rsp (64 bytes, 32-byte aligned):
      0..31   spill of the input; 0..15 and 16..31 are reused as the
	      two output buffers of the second call
     32..47   copy of the high half of the input  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r13
	cfi_rel_offset (%r13, -8)
	pushq	%r14
	cfi_rel_offset (%r14, -16)
	andq	$-32, %rsp
	subq	$64, %rsp

	movq	%rsi, %r14
	vmovaps	%ymm0, (%rsp)
	movq	%rdi, %r13
	/* Move the high half out of the way; 0..31 is overwritten by the
	   outputs of the second call.  */
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)
	vzeroupper
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)

	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	/* Unaligned stores: nothing visible in this macro guarantees that
	   the caller buffers are 16-byte aligned.  */
	vmovups	%xmm0, 16(%r13)
	vmovups	%xmm1, 16(%r14)

	movq	-16(%rbp), %r14
	cfi_restore (%r14)
	movq	-8(%rbp), %r13
	cfi_restore (%r13)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* AVX512 ISA version as wrapper to AVX2 ISA version.

   In:   %zmm0 = input vector.
   Out:  %zmm0 = result of callee on the low 256 bits of the input in
	 its low half, result of callee on the high 256 bits in its
	 high half.

   %rbp is used as frame pointer so that %rsp can be realigned to 64
   bytes.  Stack frame (128 bytes):
      0..63    spill of the input
     64..95    result for the low half
     96..127   result for the high half  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp

	vmovups	%zmm0, (%rsp)
	vmovupd	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 64(%rsp)

	vmovupd	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 96(%rsp)

	/* Both halves are adjacent on the stack; load them as one
	   512-bit vector.  */
	vmovups	64(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.
*/

/* In:   %zmm0, %zmm1 = input vectors.
   Out:  %zmm0 = result of callee on the low 256 bits of both inputs in
	 its low half, result of callee on the high 256 bits in its
	 high half.

   %rbp is used as frame pointer so that %rsp can be realigned to 64
   bytes.  Stack frame (192 bytes):
       0..63    spill of the first input
      64..127   spill of the second input
     128..159   result for the low halves
     160..191   result for the high halves  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp

	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)
	vmovups	(%rsp), %ymm0
	vmovups	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 128(%rsp)

	vmovups	32(%rsp), %ymm0
	vmovups	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 160(%rsp)

	/* Both halves are adjacent on the stack; load them as one
	   512-bit vector.  */
	vmovups	128(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   In:   %zmm0 = input vector, %rdi, %rsi = two output pointers.
   Out:  the first call receives the low 256 bits of the input and the
	 original %rdi/%rsi unchanged.  The second call receives the
	 high 256 bits and two 32-byte stack buffers; their contents
	 are then copied to offset 32 of the buffers given by the
	 original %rdi and %rsi.

   Register roles across the calls:
     %r12 = original %rdi, %r13 = original %rsi (both callee-saved).

   %r12 and %r13 are pushed BEFORE %rsp is realigned, so they live at
   fixed offsets from %rbp (-8 and -16) and can be described to the
   unwinder with cfi_rel_offset while %rbp is the CFA register.

   Stack frame below the realigned %rsp (128 bytes, 64-byte aligned):
      0..63    spill of the input
     64..95    buffer passed in %rdi to the second call
     96..127   buffer passed in %rsi to the second call  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r12
	cfi_rel_offset (%r12, -8)
	pushq	%r13
	cfi_rel_offset (%r13, -16)
	andq	$-64, %rsp
	subq	$128, %rsp

	movq	%rsi, %r13
	vmovaps	%zmm0, (%rsp)
	movq	%rdi, %r12
	vmovaps	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)

	vmovaps	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)

	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	/* Unaligned stores: nothing visible in this macro guarantees that
	   the caller buffers are 32-byte aligned.  */
	vmovups	%ymm0, 32(%r12)
	vmovups	%ymm1, 32(%r13)

	movq	-16(%rbp), %r13
	cfi_restore (%r13)
	movq	-8(%rbp), %r12
	cfi_restore (%r12)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm