/* Function powf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_powf_data.h"
#include "svml_s_wrapper_impl.h"

/*
   ALGORITHM DESCRIPTION:

     We use the following identity: pow(x,y) = 2^(y * log2(x)).

     1) log2(x) calculation
        Here we use the following formula.
        Let |x|=2^k1*X1, where k1 is an integer, 1<=X1<2.
        Let C ~= 1/ln(2),
        Rcp1 ~= 1/X1,   X2=Rcp1*X1,
        Rcp2 ~= 1/X2,   X3=Rcp2*X2,
        Rcp3 ~= 1/X3,   Rcp3C ~= C/X3.
        Then
          log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
                    log2(X1*Rcp1*Rcp2*Rcp3C/C),
        where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), and q is very small.

        The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
        Rcp3C, log2(C/Rcp3C) are taken from tables.
        The values of Rcp1, Rcp2, Rcp3C are chosen such that
        RcpC=Rcp1*Rcp2*Rcp3C is exactly representable in the target
        precision.

        log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
             = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
             = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
             = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
        where
             cq=X1*Rcp1*Rcp2*Rcp3C-C,
             a1=1/(C*ln(2))-1 is small,
             a2=1/(2*C^2*ln2),
             a3=1/(3*C^3*ln2),
                  ...
        The log2 result is split into three parts: HH+HL+HLL.

     2) Calculation of y*log2(x)
        Split y into YHi+YLo.
        Get the high part PH and the medium part PL of y*log2|x|.
        Get the low part PLL of y*log2|x|.
        Now we have PH+PL+PLL ~= y*log2|x|.

     3) Calculation of 2^(y*log2(x))
        Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
        where expK=7 in this implementation, N and j are integers,
        0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
        2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
        where 2^(j/2^expK) is stored in a table, and
        2^Z ~= 1 + B1*Z + B2*Z^2 + ... + B5*Z^5.
        We compute 2^(PH+PL+PLL) as follows:
        Break PH into PHH + PHL, where PHH = N + j/2^expK.
        Z = PHL + PL + PLL
        Exp2Poly = B1*Z + B2*Z^2 + ... + B5*Z^5
        Get 2^(j/2^expK) from the table in the form THI+TLO.
        Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
        Get the significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
          ResHi := THI
          ResLo := THI * Exp2Poly + TLO
        Get the exponent ERes of the result:
          Res := ResHi + ResLo,
          ERes := ex(Res) + N.  */
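
/* For reference, a minimal scalar C sketch of the same flow.  It is
   illustrative only: log2() stands in for the table-driven reduction of
   step 1, the 2^(j/2^expK) entry is recomputed instead of being read
   from __svml_spow_data, and no special cases (x <= 0, NaN, overflow,
   underflow) are handled.  Names and coefficients below are
   placeholders, not part of this implementation.

     #include <math.h>

     static float
     powf_sketch (float x, float y)
     {
       // Steps 1-2: y * log2|x|, carried in double precision, so the
       // PH/PL/PLL splitting of the vector code is not needed here.
       double p = (double) y * log2 (fabs ((double) x));

       // Step 3: p = N + j/2^expK + Z with expK = 7, |Z| <= 2^(-expK-1).
       const int expK = 7;
       double scaled = p * (1 << expK);
       double nj = floor (scaled + 0.5);          // N*2^expK + j
       double z = (scaled - nj) / (1 << expK);
       int n = (int) floor (nj / (1 << expK));
       int j = (int) nj - n * (1 << expK);        // 0 <= j < 2^expK

       // 2^(j/2^expK) would come from the THI+TLO table; 2^Z comes from
       // a short polynomial in Z, as Exp2Poly does above.
       double thi = exp2 ((double) j / (1 << expK));
       double e2z = 1.0 + z * (0.6931472 + z * (0.2402265 + z * 0.0555041));

       return (float) ldexp (thi * e2z, n);
     }  */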

	.text
ENTRY (_ZGVeN16vv_powf_knl)
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_spow_data@GOTPCREL(%rip), %rdx
        vmovaps   %zmm1, %zmm9
        vshuff32x4 $238, %zmm0, %zmm0, %zmm7
        kxnorw    %k3, %k3, %k3
        vcvtps2pd %ymm0, %zmm14
        vcvtps2pd %ymm7, %zmm10
        movl      $-1, %eax
        movq      $-1, %rcx
        vpandd    _ABSMASK(%rdx), %zmm9, %zmm4
        vmovups   _ExpMask(%rdx), %zmm6

/* exponent bits selection */
        vpsrlq    $20, %zmm14, %zmm13
        vshuff32x4 $238, %zmm9, %zmm9, %zmm8
        vpcmpd    $5, _INF(%rdx), %zmm4, %k2
        vpsrlq    $32, %zmm13, %zmm15
        vcvtps2pd %ymm8, %zmm2
        vmovups   _Two10(%rdx), %zmm4
        vpmovqd   %zmm15, %ymm12
        vcvtps2pd %ymm9, %zmm1
        vpsubd    _NMINNORM(%rdx), %zmm0, %zmm3
        vpbroadcastd %eax, %zmm8{%k2}{z}
        vpcmpd    $5, _NMAXVAL(%rdx), %zmm3, %k1

/* preserve mantissa, set input exponent to 2^(-10) */
        vmovaps   %zmm6, %zmm3
        vpternlogq $248, %zmm6, %zmm10, %zmm4
        vpsrlq    $20, %zmm10, %zmm10
        vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3

/* reciprocal approximation good to at least 11 bits */
        vrcp28pd  %zmm4, %zmm11
        vpsrlq    $32, %zmm10, %zmm14
        vpbroadcastd %eax, %zmm7{%k1}{z}
        kxnorw    %k1, %k1, %k1
        vrcp28pd  %zmm3, %zmm5
        vpmovqd   %zmm14, %ymm6
        vshufi32x4 $68, %zmm6, %zmm12, %zmm13
        vmovups   _One(%rdx), %zmm6

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
        vrndscalepd $8, %zmm5, %zmm14

/* biased exponent in DP format */
        vshuff32x4 $238, %zmm13, %zmm13, %zmm5
        vrndscalepd $8, %zmm11, %zmm11
        vcmppd    $30, _Threshold(%rdx), %zmm14, %k2
        vcvtdq2pd %ymm13, %zmm10
        vcvtdq2pd %ymm5, %zmm15

/* table lookup */
        vpsrlq    $40, %zmm14, %zmm13
        vpxord    %zmm5, %zmm5, %zmm5
        vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3}
        vfmsub213pd %zmm6, %zmm14, %zmm3
        vfmsub213pd %zmm6, %zmm11, %zmm4
        vcmppd    $30, _Threshold(%rdx), %zmm11, %k3
        vpbroadcastq %rcx, %zmm14{%k2}{z}

/* dpP= _dbT+lJ*T_ITEM_GRAN */
        kxnorw    %k2, %k2, %k2
        vpsrlq    $40, %zmm11, %zmm12
        vpxord    %zmm6, %zmm6, %zmm6
        vpbroadcastq %rcx, %zmm11{%k3}{z}
        kxnorw    %k3, %k3, %k3
        vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1}
        vmovups   _Bias1(%rdx), %zmm12
        vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14
        vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12
        vsubpd    %zmm14, %zmm10, %zmm13
        vsubpd    %zmm12, %zmm15, %zmm10
        vmovups   _poly_coeff_3(%rdx), %zmm11
        vmovups   _poly_coeff_4(%rdx), %zmm15
        vfmadd213pd %zmm15, %zmm4, %zmm11
        vmulpd    %zmm4, %zmm4, %zmm12
        vmovaps   %zmm15, %zmm14
        vmulpd    %zmm3, %zmm3, %zmm15
        vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14

/* reconstruction */
        vfmadd213pd %zmm4, %zmm12, %zmm11
        vfmadd213pd %zmm3, %zmm15, %zmm14
        vaddpd    %zmm6, %zmm11, %zmm11
        vaddpd    %zmm5, %zmm14, %zmm3
        vfmadd231pd _L2(%rdx), %zmm10, %zmm11
        vfmadd132pd _L2(%rdx), %zmm3, %zmm13
        vmulpd    %zmm2, %zmm11, %zmm12
        vmulpd    %zmm1, %zmm13, %zmm10
        vmulpd    __dbInvLn2(%rdx), %zmm12, %zmm6

/* hi bits */
        vpsrlq    $32, %zmm12, %zmm12
        vmulpd    __dbInvLn2(%rdx), %zmm10, %zmm1

/* to round down; if dR is an integer we will get R = 1, which is ok */
        vsubpd    __dbHALF(%rdx), %zmm6, %zmm4
        vpsrlq    $32, %zmm10, %zmm11
        vpmovqd   %zmm11, %ymm3
        vsubpd    __dbHALF(%rdx), %zmm1, %zmm2
        vaddpd    __dbShifter(%rdx), %zmm4, %zmm14
        vpmovqd   %zmm12, %ymm4
        vshufi32x4 $68, %zmm4, %zmm3, %zmm5
        vpxord    %zmm4, %zmm4, %zmm4
        vaddpd    __dbShifter(%rdx), %zmm2, %zmm2

/* iAbsX = iAbsX&iAbsMask; */
        vpandd    __iAbsMask(%rdx), %zmm5, %zmm11
        vpxord    %zmm5, %zmm5, %zmm5
        vsubpd    __dbShifter(%rdx), %zmm14, %zmm13

/* iRangeMask = (iAbsX>iDomainRange) */
        vpcmpgtd     __iDomainRange(%rdx), %zmm11, %k1
        vsubpd       __dbShifter(%rdx), %zmm2, %zmm15
        vpbroadcastd %eax, %zmm10{%k1}{z}
        vpternlogd   $254, %zmm8, %zmm7, %zmm10

/* [0..1) */
        vsubpd    %zmm15, %zmm1, %zmm1

/* low K bits */
        vpandq    __lbLOWKBITS(%rdx), %zmm14, %zmm11
        vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3}
        vsubpd    %zmm13, %zmm6, %zmm7
        vptestmd  %zmm10, %zmm10, %k0
        vpandq    __lbLOWKBITS(%rdx), %zmm2, %zmm10
        vmulpd    __dbC1(%rdx), %zmm1, %zmm1
        vmulpd    __dbC1(%rdx), %zmm7, %zmm3
        vpsrlq    $11, %zmm2, %zmm8
        vpsrlq    $11, %zmm14, %zmm2

/* NB : including +/- sign for the exponent!! */
        vpsllq    $52, %zmm8, %zmm8
        kmovw     %k0, %ecx
        vpsllq    $52, %zmm2, %zmm6
        vfmadd213pd %zmm5, %zmm3, %zmm5
        vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2}
        vfmadd213pd %zmm4, %zmm1, %zmm4
        vpaddq    %zmm6, %zmm5, %zmm10
        vcvtpd2ps %zmm10, %ymm12
        vpaddq    %zmm8, %zmm4, %zmm7
        vcvtpd2ps %zmm7, %ymm11
        vshuff32x4 $68, %zmm12, %zmm11, %zmm1
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm9, 1216(%rsp)
        vmovups   %zmm1, 1280(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        kmovw     1048(%rsp), %k4
        movq      1064(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1056(%rsp), %rdi
        kmovw     1032(%rsp), %k6
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups   1280(%rsp), %zmm1
        jmp       .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm0
        vmovss    1220(%rsp,%r15,8), %xmm1
        call      JUMPTARGET(powf)
        vmovss    %xmm0, 1284(%rsp,%r15,8)
        jmp       .LBL_1_8

.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm0
        vmovss    1216(%rsp,%r15,8), %xmm1
        call      JUMPTARGET(powf)
        vmovss    %xmm0, 1280(%rsp,%r15,8)
        jmp       .LBL_1_7
END (_ZGVeN16vv_powf_knl)

ENTRY (_ZGVeN16vv_powf_skx)
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_spow_data@GOTPCREL(%rip), %rax
        vextractf32x8 $1, %zmm1, %ymm14
        vextractf32x8 $1, %zmm0, %ymm15
        vpsubd _NMINNORM(%rax), %zmm0, %zmm9
        vmovups   %zmm26, 1280(%rsp)
        vmovups _ExpMask(%rax), %zmm6
        vpcmpd    $1, _NMAXVAL(%rax), %zmm9, %k1
        vcvtps2pd %ymm0, %zmm5
        vcvtps2pd %ymm1, %zmm12
        kxnorw    %k3, %k3, %k3

/* exponent bits selection */
        vpsrlq    $20, %zmm5, %zmm3
        vpsrlq    $32, %zmm3, %zmm2
        vpmovqd   %zmm2, %ymm11
        vcvtps2pd %ymm14, %zmm13
        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
        vmovaps   %zmm14, %zmm26
        vpandd _ABSMASK(%rax), %zmm1, %zmm8
        vpcmpd    $1, _INF(%rax), %zmm8, %k2
        vpandnd   %zmm9, %zmm9, %zmm26{%k1}
        vmovups _Two10(%rax), %zmm9
        kxnorw    %k1, %k1, %k1
        vcvtps2pd %ymm15, %zmm4
        vmovaps   %zmm14, %zmm15

/* preserve mantissa, set input exponent to 2^(-10) */
        vpternlogq $248, %zmm6, %zmm4, %zmm9
        vpsrlq    $20, %zmm4, %zmm4

/* reciprocal approximation good to at least 11 bits */
        vrcp14pd  %zmm9, %zmm10

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
        vrndscalepd $8, %zmm10, %zmm3
        vmovups _One(%rax), %zmm10
        vfmsub213pd %zmm10, %zmm3, %zmm9
        vpandnd   %zmm8, %zmm8, %zmm15{%k2}
        vmovaps   %zmm6, %zmm8
        vpternlogq $234, _Two10(%rax), %zmm5, %zmm8
        vpsrlq    $32, %zmm4, %zmm5
        vrcp14pd  %zmm8, %zmm7
        vpmovqd   %zmm5, %ymm6
        vrndscalepd $8, %zmm7, %zmm2
        vfmsub213pd %zmm10, %zmm2, %zmm8

/* table lookup */
        vpsrlq    $40, %zmm2, %zmm10
        vinserti32x8 $1, %ymm6, %zmm11, %zmm4
        vpsrlq    $40, %zmm3, %zmm11

/* biased exponent in DP format */
        vextracti32x8 $1, %zmm4, %ymm7
        vcvtdq2pd %ymm4, %zmm6
        vpmovqd   %zmm10, %ymm4
        vpmovqd   %zmm11, %ymm5
        vpxord    %zmm10, %zmm10, %zmm10
        vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
        vpxord    %zmm11, %zmm11, %zmm11
        vcvtdq2pd %ymm7, %zmm7
        vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
        vmovups _Threshold(%rax), %zmm5
        vcmppd    $21, %zmm2, %zmm5, %k2
        vcmppd    $21, %zmm3, %zmm5, %k3
        vmovups _Bias1(%rax), %zmm3
        vmovaps   %zmm4, %zmm2
        vpandnq   %zmm5, %zmm5, %zmm2{%k2}
        vpternlogq $236, _Bias(%rax), %zmm3, %zmm2

/* dpP= _dbT+lJ*T_ITEM_GRAN */
        kxnorw    %k2, %k2, %k2
        vpandnq   %zmm5, %zmm5, %zmm4{%k3}
        vpternlogq $248, _Bias(%rax), %zmm4, %zmm3
        vsubpd    %zmm2, %zmm6, %zmm4
        vmovups _poly_coeff_3(%rax), %zmm6
        vmovups _poly_coeff_4(%rax), %zmm2
        vsubpd    %zmm3, %zmm7, %zmm5
        vmulpd    %zmm8, %zmm8, %zmm7
        vfmadd213pd %zmm2, %zmm9, %zmm6
        kxnorw    %k3, %k3, %k3
        vmovaps   %zmm2, %zmm3
        vmulpd    %zmm9, %zmm9, %zmm2
        vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3

/* reconstruction */
        vfmadd213pd %zmm9, %zmm2, %zmm6
        vfmadd213pd %zmm8, %zmm7, %zmm3
        vaddpd    %zmm11, %zmm6, %zmm8
        vaddpd    %zmm10, %zmm3, %zmm9
        vfmadd231pd _L2(%rax), %zmm5, %zmm8
        vfmadd132pd _L2(%rax), %zmm9, %zmm4
        vmulpd    %zmm13, %zmm8, %zmm13
        vmulpd    %zmm12, %zmm4, %zmm3
        vmulpd __dbInvLn2(%rax), %zmm13, %zmm10
        vmulpd __dbInvLn2(%rax), %zmm3, %zmm8

/* hi bits */
        vpsrlq    $32, %zmm3, %zmm4
        vpsrlq    $32, %zmm13, %zmm13

/* to round down; if dR is an integer we will get R = 1, which is ok */
        vsubpd __dbHALF(%rax), %zmm8, %zmm12
        vpmovqd   %zmm4, %ymm5
        vpmovqd   %zmm13, %ymm2
        vsubpd __dbHALF(%rax), %zmm10, %zmm9
        vaddpd __dbShifter(%rax), %zmm12, %zmm7
        vaddpd __dbShifter(%rax), %zmm9, %zmm9
        vsubpd __dbShifter(%rax), %zmm7, %zmm11
        vsubpd __dbShifter(%rax), %zmm9, %zmm12
        vinserti32x8 $1, %ymm2, %zmm5, %zmm3

/* iAbsX = iAbsX&iAbsMask */
        vpandd __iAbsMask(%rax), %zmm3, %zmm4

/* iRangeMask = (iAbsX>iDomainRange) */
        vpcmpd    $2, __iDomainRange(%rax), %zmm4, %k1
        vpandnd   %zmm4, %zmm4, %zmm14{%k1}
        vpternlogd $254, %zmm15, %zmm26, %zmm14

/* [0..1) */
        vsubpd    %zmm11, %zmm8, %zmm15
        vsubpd    %zmm12, %zmm10, %zmm26
        vptestmd  %zmm14, %zmm14, %k0
        vpsrlq    $11, %zmm7, %zmm8
        vpsrlq    $11, %zmm9, %zmm10
        vmulpd __dbC1(%rax), %zmm26, %zmm26
        vmulpd __dbC1(%rax), %zmm15, %zmm15

/* NB : including +/- sign for the exponent!! */
        vpsllq    $52, %zmm10, %zmm13
        vpsllq    $52, %zmm8, %zmm12
        kmovw     %k0, %ecx

/* low K bits */
        vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14
        vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6
        vpmovqd   %zmm14, %ymm7
        vpmovqd   %zmm6, %ymm9
        vpxord    %zmm2, %zmm2, %zmm2
        vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3}
        vfmadd213pd %zmm2, %zmm26, %zmm2
        vpaddq    %zmm13, %zmm2, %zmm2
        vcvtpd2ps %zmm2, %ymm4
        vpxord    %zmm11, %zmm11, %zmm11
        vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2}
        vfmadd213pd %zmm11, %zmm15, %zmm11
        vpaddq    %zmm12, %zmm11, %zmm3
        vcvtpd2ps %zmm3, %ymm5
        vinsertf32x8 $1, %ymm4, %zmm5, %zmm2
        testl     %ecx, %ecx
        jne       .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups   1280(%rsp), %zmm26
        vmovaps   %zmm2, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups   %zmm0, 1088(%rsp)
        vmovups   %zmm1, 1152(%rsp)
        vmovups   %zmm2, 1216(%rsp)
        je        .LBL_2_2

        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 984(%rsp)
        kmovw     %k5, 976(%rsp)
        kmovw     %k6, 968(%rsp)
        kmovw     %k7, 960(%rsp)
        vmovups   %zmm16, 896(%rsp)
        vmovups   %zmm17, 832(%rsp)
        vmovups   %zmm18, 768(%rsp)
        vmovups   %zmm19, 704(%rsp)
        vmovups   %zmm20, 640(%rsp)
        vmovups   %zmm21, 576(%rsp)
        vmovups   %zmm22, 512(%rsp)
        vmovups   %zmm23, 448(%rsp)
        vmovups   %zmm24, 384(%rsp)
        vmovups   %zmm25, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1000(%rsp)
        movq      %rdi, 992(%rsp)
        movq      %r12, 1032(%rsp)
        cfi_offset_rel_rsp (12, 1032)
        movb      %dl, %r12b
        movq      %r13, 1024(%rsp)
        cfi_offset_rel_rsp (13, 1024)
        movl      %ecx, %r13d
        movq      %r14, 1016(%rsp)
        cfi_offset_rel_rsp (14, 1016)
        movl      %eax, %r14d
        movq      %r15, 1008(%rsp)
        cfi_offset_rel_rsp (15, 1008)
        cfi_remember_state

.LBL_2_6:
        btl       %r14d, %r13d
        jc        .LBL_2_12

.LBL_2_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_2_6

        kmovw     984(%rsp), %k4
        kmovw     976(%rsp), %k5
        kmovw     968(%rsp), %k6
        kmovw     960(%rsp), %k7
        vmovups   896(%rsp), %zmm16
        vmovups   832(%rsp), %zmm17
        vmovups   768(%rsp), %zmm18
        vmovups   704(%rsp), %zmm19
        vmovups   640(%rsp), %zmm20
        vmovups   576(%rsp), %zmm21
        vmovups   512(%rsp), %zmm22
        vmovups   448(%rsp), %zmm23
        vmovups   384(%rsp), %zmm24
        vmovups   320(%rsp), %zmm25
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm2
        movq      1000(%rsp), %rsi
        movq      992(%rsp), %rdi
        movq      1032(%rsp), %r12
        cfi_restore (%r12)
        movq      1024(%rsp), %r13
        cfi_restore (%r13)
        movq      1016(%rsp), %r14
        cfi_restore (%r14)
        movq      1008(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm1
        vzeroupper
        vmovss    1092(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(powf)
        vmovss    %xmm0, 1220(%rsp,%r15,8)
        jmp       .LBL_2_8

.LBL_2_12:
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm1
        vzeroupper
        vmovss    1088(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(powf)
        vmovss    %xmm0, 1216(%rsp,%r15,8)
        jmp       .LBL_2_7
END (_ZGVeN16vv_powf_skx)