1.file "erf.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 08/15/01 Initial version
42// 05/20/02 Cleaned up namespace and sf0 syntax
43// 02/06/03 Reordered header: .section, .global, .proc, .align
44// 03/31/05 Reformatted delimiters between data tables
45//
46// API
47//==============================================================
48// double erf(double)
49//
50// Overview of operation
51//==============================================================
52// Background
53//
54//
55// There are 9 paths:
56// 1. x = +/-0.0
57//    Return erf(x) = +/-0.0
58//
59// 2. 0.0 < |x| < 0.5
60//    Return erf(x) = x *Pol9(x^2)
61//
62// 3. For several subranges of 0.5 <= |x| < 5.90625
63//    Return erf(x) = sign(x)*Pol19(y),
64//    where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
65//
66//    For each subrange there is particular set of coefficients.
67//    Below is the list of subranges:
68//    3.1 0.5 <= |x| < 1.0     b = a = 0.5
69//    3.2 1.0 <= |x| < 2.0,    b = a = 1.0
70//    3.3 2.0 <= |x| < 3.25    b = a = 2.0
71//    3.4 4.0 <= |x| < 5.90625 b = 4.0, a = 2.0
72//
73// 4. 3.25 <= |x| < 4.0
74//    Return erf(x) = sign(x)*Pol14(|x| - 3.25)
75//
76// 5. 5.90625 <= |x| < +INF
77//    Return erf(x) = sign(x)*(1.0d - 2^(-63))
78//
79// 6. |x| = INF
80//    Return erf(x) = sign(x) * 1.0
81//
82// 7. x = [S,Q]NaN
83//    Return erf(x) = QNaN
84//
85// 8. x is positive denormal
86//    Return erf(x) = A0*x - x^2,
87//    where A0 = 2.0/sqrt(Pi)
88//
89// 9. x is negative denormal
90//    Return erf(x) = A0*x + x^2,
91//    where A0 = 2.0/sqrt(Pi)
92//
93// Registers used
94//==============================================================
95// Floating Point registers used:
96// f8, input, output
97// f32 -> f63
98
99// General registers used:
100// r32 -> r48, r2, r3
101
102// Predicate registers used:
103// p0, p6 -> p15
104
105// p6           to filter out case when x = denormal
106// p7           to filter out case when x = [Q,S]NaN or +/-0,
107//              used also to process denormals
108// p8           to filter out case when 3.25 <= |x| < 4.0,
109//              used also to process denormals
110// p9           to filter out case when |x| = inf
111// p10          to filter out case when |x| < 0.5
112// p11          set when |x| < 3.25 or |x| > 4.0
113// p12          to filter out case when |x| >= 5.90625
114// p13          set if 4.0 <=|x| < 5.90625
115// p14          set to 1 for positive x
116// p15          set to 1 for negative x
117
118// Assembly macros
119//==============================================================
120rDataPtr           = r2
121rDataPtr1          = r3
122
123rBias              = r33
124rCoeffAddr3        = r34
125rThreeAndQ         = r35
126rCoeffAddr2        = r36
127rMask              = r37
128rArg               = r38
129rSignBit           = r39
130rAbsArg            = r40
131rSaturation        = r41
132rIndex             = r42
133rCoeffAddr1        = r43
134rCoeffAddr4        = r44
135rShiftedArg        = r45
136rShiftedArgMasked  = r46
137rBiasedExpOf4      = r47
138rShiftedAbsArg     = r48
139
140//==============================================================
141fA0                = f32
142fA1                = f33
143fA2                = f34
144fA3                = f35
145fA4                = f36
146fA5                = f37
147fA6                = f38
148fA7                = f39
149fA8                = f40
150fA9                = f41
151fA10               = f42
152fA11               = f43
153fA12               = f44
154fA13               = f45
155fA14               = f46
156fA15               = f47
157fA16               = f48
158fA17               = f49
159fA18               = f50
160fA19               = f51
161fArgSqr            = f52
162fArgAbsNorm        = f53
163fSignumX           = f54
164fRes               = f55
165fThreeAndQ         = f56
166fArgAbs            = f57
167fTSqr              = f58
168fTQuadr            = f59
169fTDeg3             = f60
170fTDeg7             = f61
171fArgAbsNormSgn     = f62
172fTQuadrSgn         = f63
173
174// Data tables
175//==============================================================
176RODATA
177
178.align 64
179
180LOCAL_OBJECT_START(erf_data)
181// Coefficients ##0..15
182// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
183data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
184data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
185data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
186data8 0x8DAB171246CC2B89, 0x00003FDC //A16
187data8 0xC0B1D6662F8A7564, 0x00003FDF //A15
188data8 0xA46374AC35099BAF, 0x0000BFE1 //A14
189data8 0xB2F230996346EF27, 0x0000BFE4 //A13
190data8 0xCDEC50950FACE04A, 0x00003FE6 //A12
191data8 0x826014649396E9D2, 0x00003FE9 //A11
192data8 0xCDB787DC718B13F9, 0x0000BFEB //A10
193data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9
194data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8
195data8 0xB11E30BE912617D3, 0x00003FF0 //A7
196data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
197data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
198data8 0xBB793EF404C09A22, 0x00003FF8 //A4
199// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
200data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
201data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
202data8 0xA37B3242B7809E12, 0x00003FEC //A17
203data8 0xA0330A5CD2E91689, 0x0000BFED //A16
204data8 0x8E34A678F3497D17, 0x0000BFEC //A15
205data8 0xAC185D45A2772384, 0x00003FEF //A14
206data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13
207data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12
208data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11
209data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10
210data8 0xF71662D3132B7759, 0x0000BFF5 //A9
211data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8
212data8 0xF84B80EFCA43895D, 0x00003FF8 //A7
213data8 0x9722D22DA628A17B, 0x00003FF7 //A6
214data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
215data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
216// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
217data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
218data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
219data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
220data8 0xB99A33294D32429D, 0x0000BFFB //A16
221data8 0x8109D9C7197BC7C9, 0x00003FFB //A15
222data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14
223data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13
224data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12
225data8 0xDA4092509EE7D111, 0x00003FFC //A11
226data8 0x89D98C561B0C9040, 0x0000BFFD //A10
227data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9
228data8 0xD089C56948731561, 0x00003FFD //A8
229data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7
230data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
231data8 0xD673A02CB5766633, 0x00003FFD //A5
232data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
233// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
234data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
235data8 0xF76BE1935675D5C8, 0x00003FFE //A18
236data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
237data8 0x8BE8F573D348DDA4, 0x00004000 //A16
238data8 0x81E91923A1030502, 0x0000BFFF //A15
239data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14
240data8 0x84EF6B4E17404384, 0x00004000 //A13
241data8 0x91FEF33015404991, 0x0000C000 //A12
242data8 0xDEDF6A9370747E56, 0x00003FFF //A11
243data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10
244data8 0xFAD1CE912473937B, 0x00003FFD //A9
245data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8
246data8 0xFECAF0097ACF981B, 0x00003FFA //A7
247data8 0x8829A394065E4B95, 0x0000BFF9 //A6
248data8 0xED3003E477A53EE7, 0x00003FF6 //A5
249data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
250//
251// Coefficients ##16..19
252// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
253data8 0x95FA98C337005D13, 0x0000BFF9 //A3
254data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
255data8 0xE0F7E524D2808A98, 0x00003FFD //A1
256data8 0x853F7AE0C76E915F, 0x00003FFE //A0
257// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
258data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
259data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
260data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
261data8 0xD7BB3D3A08445636, 0x00003FFE //A0
262// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
263data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
264data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
265data8 0xA94DCF467CD10A16, 0x00003FFA //A1
266data8 0xFECD70A13CAF1997, 0x00003FFE //A0
267// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
268data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
269data8 0x8858A465CE594BD1, 0x0000BFEE //A2
270data8 0x8858A447456DE61D, 0x00003FEA //A1
271data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
272// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
273data8 0xBE839EDBB36C7FCE //A9
274data8 0x3EBB7745A18DD242 //A8
275data8 0xBF4C02DB238F2AFC //A5
276data8 0x3F7565BCD0A9A3EA //A4
277data8 0xC093A3581BCF3333, 0x0000BFFD //A1
278data8 0xBEEF4BB82AD8AE22 //A7
279data8 0x3F1F9A2A57A218CD //A6
280data8 0xBF9B82CE3127F4E4 //A3
281data8 0x3FBCE2F21A042B25 //A2
282data8 0x906EBA8214DB688D, 0x00003FFF //A0
283// 1.0 - 2^(-63)
284data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
285// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
286data8 0x95E91576C7A12250, 0x00003FE7 //A14
287data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
288data8 0xED761DAFAF814DE9, 0x00003FEB //A12
289data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11
290data8 0xA662D27096B08D7C, 0x0000BFEC //A10
291data8 0xDA0F410AE6233EA5, 0x00003FEF //A9
292data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8
293data8 0xB241E236A5EDCED3, 0x00003FF2 //A7
294data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6
295data8 0xA4852D0B1D87000A, 0x00003FF3 //A5
296data8 0x963EB00039489476, 0x0000BFF3 //A4
297data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3
298data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2
299data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
300data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
301// A = 2.0/sqrt(Pi)
302data8 0x906EBA8214DB688D, 0x00003FFF
303LOCAL_OBJECT_END(erf_data)
304
305
306.section .text
307GLOBAL_LIBM_ENTRY(erf)
308
309{ .mfi
310      alloc          r32 = ar.pfs, 0, 17, 0, 0
311      fmerge.se      fArgAbsNorm = f1, f8         // normalized x
312      adds           rSignBit = 0x1, r0
313}
314{ .mfi
315      addl           rDataPtr = @ltoff(erf_data), gp
316      fma.s1         fArgSqr = f8, f8, f0         // x^2
317      addl           rThreeAndQ = 0x400A0, r0     // shifted bits of 3.25
318}
319;;
320{ .mfi
321      getf.d         rArg = f8                    // x in GR
322      fclass.m       p6,p0 = f8, 0x0b             // is x denormal ?
323      shl            rThreeAndQ = rThreeAndQ, 44  // bits of 3.25
324}
325{ .mfi
326      ld8            rDataPtr = [rDataPtr]
327      nop.f          0
328      addl           rBiasedExpOf4 = 0x40100, r0  // shifted bits of 4.0
329}
330;;
331{ .mfi
332      addl           rSaturation = 0x4017A, r0    // shifted bits of 5.90625
333      fclass.m       p7,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
334      shl            rSignBit = rSignBit, 63      // mask for sign bit
335}
336{ .mfi
337      addl           rMask = 0x7FF00, r0          // Mask for index bits
338      nop.f          0
339      addl           rBias = 0x3FE00, r0          // bias of 0.5 << 8
340}
341;;
342{ .mfi
343      setf.d         fThreeAndQ = rThreeAndQ      // 3.25 if FP register
344      fclass.m       p9,p0 = f8, 0x23             // is x +/- inf?
345      shr.u          rShiftedArg = rArg, 44
346}
347{ .mfb
348      andcm          rAbsArg = rArg, rSignBit     // |x| in GR
349      nop.f          0
350(p6)  br.cond.spnt   erf_denormal                 // branch out if x is denormal
351}
352;;
353{ .mfi
354      and            rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
355      fmerge.s       fArgAbs = f1, f8             // |x|
356      shr            rShiftedAbsArg = rAbsArg, 44
357}
358{ .mfb
359      cmp.lt         p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| >= 3.25
360(p7)  fma.d.s0       f8 = f8,f1,f8                // NaN or +/-0
361(p7)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
362}
363;;
364{ .mfi
365      sub            rIndex = rShiftedArgMasked, rBias // index << 8
366      nop.f          0
367      cmp.lt         p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
368}
369{ .mfb
370      // p8 = 1 if 3.25 <= |x| < 4.0
371(p8)  cmp.lt         p8, p11 = rShiftedAbsArg, rBiasedExpOf4
372      fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1
373(p10) br.cond.spnt   erf_near_zero // branch out if |x| < 0.5
374}
375;;
376.pred.rel "mutex", p8, p11
377{ .mfi
378(p8)  adds           rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0
379(p9)  fmerge.s       f8 = f8,f1                   // +/- inf
380      nop.i          0
381}
382{ .mfb
383(p11) add            rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
384      nop.f          0
385(p9)  br.ret.spnt    b0                            // exit for x = +/- inf
386}
387;;
388{ .mfi
389      adds           rCoeffAddr2 = 16, rCoeffAddr1
390      fmerge.s       fSignumX = f8, f1            // signum(x)
391      nop.i          0
392}
393{ .mfb
394      cmp.lt         p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625?
395      nop.f          0
396(p12) br.cond.spnt   erf_saturation               // branch out if x |x| >= 6.0
397}
398;;
399// Here if paths #3,4
400// if path #4 we'll branch out after loading of 14 necessary coefficients
401{.mfi
402      ldfe           fA19 = [rCoeffAddr1], 32
403      nop.f          0
404      nop.i          0
405}
406{.mfi
407      ldfe           fA18 = [rCoeffAddr2], 32
408      nop.f          0
409      adds           rCoeffAddr3 = 1024, rDataPtr
410}
411;;
412{.mfi
413      ldfe           fA17 = [rCoeffAddr1], 32
414      nop.f          0
415      nop.i          0
416}
417{.mfi
418      ldfe           fA16 = [rCoeffAddr2], 32
419      nop.f          0
420      nop.i          0
421}
422;;
423{.mfi
424      ldfe           fA15 = [rCoeffAddr1], 32
425      fma.s1         fTSqr = fArgAbsNorm, fArgAbsNorm, f0
426      shr.u          rIndex = rIndex, 2
427}
428{.mfi
429      ldfe           fA14 = [rCoeffAddr2], 32
430      nop.f          0
431      adds           rCoeffAddr4 = 16, r0
432}
433;;
434{.mfi
435      ldfe           fA13 = [rCoeffAddr1], 32
436      nop.f          0
437      // address of coefficients ##16..23
438      add            rCoeffAddr3 = rCoeffAddr3, rIndex
439}
440{.mfi
441      ldfe           fA12 = [rCoeffAddr2], 32
442      nop.f          0
443      cmp.lt         p15, p14 = rArg, r0
444}
445;;
446{.mfi
447      ldfe           fA11 = [rCoeffAddr1], 32
448      nop.f          0
449      add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4
450}
451{.mfi
452      ldfe           fA10 = [rCoeffAddr2], 32
453      nop.f          0
454      nop.i          0
455}
456;;
457{.mfi
458      ldfe           fA9 = [rCoeffAddr1], 32
459      nop.f          0
460      nop.i          0
461}
462{.mfi
463      ldfe           fA8 = [rCoeffAddr2], 32
464      nop.f          0
465      nop.i          0
466}
467;;
468{.mfi
469      ldfe           fA7 = [rCoeffAddr1], 32
470      fms.s1         fArgAbs = fArgAbs, f1, fThreeAndQ
471      nop.i          0
472}
473{.mfb
474      ldfe           fA6 = [rCoeffAddr2], 32
475      nop.f          0
476(p8)  br.cond.spnt   erf_3q_4 // branch out if  3.25 < |x| < 4.0
477}
478;;
479{.mfi
480      ldfe           fA5 = [rCoeffAddr1], 32
481      fma.s1         fTDeg3 = fArgAbsNorm, fTSqr, f0
482      nop.i          0
483}
484{.mfi
485      ldfe           fA4 = [rCoeffAddr2], 32
486      fma.s1         fTQuadr = fTSqr, fTSqr, f0
487      nop.i          0
488}
489;;
490// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
491{.mfi
492      ldfe           fA3 = [rCoeffAddr3], 32
493      fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0
494      nop.i          0
495}
496{.mfi
497      ldfe           fA2 = [rCoeffAddr4], 32
498      nop.f          0
499      nop.i          0
500}
501;;
502{.mfi
503      ldfe           fA1 = [rCoeffAddr3], 32
504      fma.s1         fRes = fA19, fArgAbsNorm, fA18
505      nop.i          0
506}
507{.mfi
508      ldfe           fA0 = [rCoeffAddr4], 32
509      nop.f          0
510      nop.i          0
511}
512;;
513{ .mfi
514      nop.m          0
515      fma.s1         fA17 = fA17, fArgAbsNorm, fA16
516      nop.i          0
517}
518;;
519{ .mfi
520      nop.m          0
521      fma.s1         fA15 = fA15, fArgAbsNorm, fA14
522      nop.i          0
523}
524;;
525{ .mfi
526      nop.m          0
527      fma.s1         fTDeg7 = fTDeg3, fTQuadr, f0
528      nop.i          0
529}
530{ .mfi
531      nop.m          0
532      fma.s1         fA13 = fA13, fArgAbsNorm, fA12
533      nop.i          0
534}
535;;
536{ .mfi
537      nop.m          0
538      fma.s1         fA11 = fA11, fArgAbsNorm, fA10
539      nop.i          0
540}
541;;
542{ .mfi
543      nop.m          0
544      fma.s1         fA9 = fA9, fArgAbsNorm, fA8
545      nop.i          0
546}
547;;
548{ .mfi
549      nop.m          0
550      fma.s1         fRes = fRes, fTSqr, fA17
551      nop.i          0
552}
553{ .mfi
554      nop.m          0
555      fma.s1         fA7 = fA7, fArgAbsNorm, fA6
556      nop.i          0
557}
558;;
559{ .mfi
560      nop.m          0
561      fma.s1         fA5 = fA5, fArgAbsNorm, f0
562      nop.i          0
563}
564;;
565{ .mfi
566      nop.m          0
567      fma.s1         fA15 = fA15, fTSqr, fA13
568      nop.i          0
569}
570{ .mfi
571      nop.m          0
572      fma.s1         fA4 = fA4, fArgAbsNorm, fA3
573      nop.i          0
574}
575;;
576{ .mfi
577      nop.m          0
578      fma.s1         fA2 = fA2, fArgAbsNorm, fA1
579      nop.i          0
580}
581;;
582{ .mfi
583      nop.m          0
584      fma.s1         fA11 = fA11, fTSqr, fA9
585      nop.i          0
586}
587;;
588{ .mfi
589      nop.m          0
590      fma.s1         fA7 = fA7, fTSqr, fA5
591      nop.i          0
592}
593;;
594{ .mfi
595      nop.m          0
596      fma.s1         fRes = fRes, fTQuadr, fA15
597      nop.i          0
598}
599;;
600{ .mfi
601      nop.m          0
602      fma.s1         fA4 = fA4, fTSqr, fA2
603      nop.i          0
604}
605;;
606{ .mfi
607      nop.m          0
608      fma.s1         fRes = fRes, fTQuadr, fA11
609      nop.i          0
610}
611;;
612{ .mfi
613      nop.m          0
614      fma.s1         fA4 = fA7, fTDeg3, fA4
615      nop.i          0
616}
617;;
618{ .mfi
619      nop.m          0
620      fma.s1         fRes = fRes,  fTDeg7, fA4
621      nop.i          0
622}
623;;
624{ .mfi
625      nop.m          0
626      // result for negative argument
627(p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
628      nop.i          0
629}
630{ .mfb
631      nop.m          0
632      // result for positive argument
633(p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
634      br.ret.sptk    b0
635}
636
637// Here if  3.25 < |x| < 4.0
638.align 32
639erf_3q_4:
640.pred.rel "mutex", p14, p15
641{ .mfi
642      ldfe           fA5 = [rCoeffAddr1], 32
643      fma.s1         fTSqr = fArgAbs, fArgAbs, f0
644      nop.i          0
645}
646{ .mfi
647      nop.m          0
648      fma.s1         fRes = fA19, fArgAbs, fA18
649      nop.i          0
650}
651;;
652{ .mfi
653      nop.m          0
654      fma.s1         fA17 = fA17, fArgAbs, fA16
655      nop.i          0
656}
657{ .mfi
658      nop.m          0
659      fma.s1         fA15 = fA15, fArgAbs, fA14
660      nop.i          0
661}
662;;
663{ .mfi
664      nop.m          0
665      fma.s1         fA13 = fA13, fArgAbs, fA12
666      nop.i          0
667}
668{ .mfi
669      nop.m          0
670      fma.s1         fA11 = fA11, fArgAbs, fA10
671      nop.i          0
672}
673;;
674{ .mfi
675      nop.m          0
676      fma.s1         fA9 = fA9, fArgAbs, fA8
677      nop.i          0
678}
679{ .mfi
680      nop.m          0
681      fma.s1         fArgAbsNormSgn = fArgAbs, fSignumX, f0
682      nop.i          0
683}
684;;
685{ .mfi
686      nop.m          0
687      fma.s1         fTQuadr = fTSqr, fTSqr, f0
688      nop.i          0
689}
690;;
691{ .mfi
692      nop.m          0
693      fma.s1         fRes = fRes, fTSqr, fA17
694      nop.i          0
695}
696;;
697{ .mfi
698      nop.m          0
699      fma.s1         fA15 = fA15, fTSqr, fA13
700      nop.i          0
701}
702;;
703{ .mfi
704      nop.m          0
705      fma.s1         fA11 = fA11, fTSqr, fA9
706      nop.i          0
707}
708{ .mfi
709      nop.m          0
710      fma.s1         fA7 = fA7, fArgAbs, fA6
711      nop.i          0
712}
713;;
714{ .mfi
715      nop.m          0
716      fma.s1         fTDeg7 = fTQuadr, fTSqr, f0
717      nop.i          0
718}
719{ .mfi
720      nop.m          0
721      fma.s1         fRes = fRes, fTQuadr, fA15
722      nop.i          0
723}
724;;
725{ .mfi
726      nop.m          0
727      fma.s1         fA11 = fA11, fTSqr, fA7
728      nop.i          0
729}
730;;
731{ .mfi
732      nop.m          0
733      fma.s1         fRes = fRes, fTDeg7, fA11
734      nop.i          0
735}
736;;
737{ .mfi
738      nop.m          0
739      // result for negative argument
740(p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
741      nop.i          0
742}
743{ .mfb
744      nop.m          0
745      // result for positive argument
746(p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
747      br.ret.sptk    b0
748}
749;;
750
751// Here if |x| < 0.5
752.align 32
753erf_near_zero:
754{ .mfi
755      adds           rCoeffAddr1 = 1280, rDataPtr // address of A9
756      fma.s1         fTSqr = fArgSqr, fArgSqr, f0 // x^4
757      nop.i          0
758}
759{ .mfi
760      adds           rCoeffAddr2 = 1328, rDataPtr // address of A7
761      nop.f          0
762      nop.i          0
763}
764;;
765{ .mfi
766      ldfpd          fA9, fA8 = [rCoeffAddr1], 16
767      nop.f          0
768      nop.i          0
769}
770{ .mfi
771      ldfpd          fA7, fA6 = [rCoeffAddr2], 16
772      nop.f          0
773      nop.i          0
774}
775;;
776{ .mfi
777      ldfpd          fA5, fA4 = [rCoeffAddr1], 16
778      nop.f          0
779      nop.i          0
780}
781{ .mfi
782      ldfpd          fA3, fA2 = [rCoeffAddr2], 16
783      nop.f          0
784      nop.i          0
785}
786;;
787{ .mfi
788      ldfe           fA1 = [rCoeffAddr1]
789      nop.f          0
790      nop.i          0
791}
792{ .mfi
793      ldfe           fA0 = [rCoeffAddr2]
794      nop.f          0
795      nop.i          0
796}
797;;
798{ .mfi
799      nop.m          0
800      fma.s1         fTQuadr = fTSqr, fTSqr, f0
801      nop.i          0
802}
803;;
804{ .mfi
805      nop.m          0
806      fma.s1         fRes = fA9, fArgSqr, fA8
807      nop.i          0
808}
809{ .mfi
810      nop.m          0
811      fma.s1         fA7 = fA7, fArgSqr, fA6
812      nop.i          0
813}
814;;
815{ .mfi
816      nop.m          0
817      fma.s1         fA3 = fA3, fArgSqr, fA2
818      nop.i          0
819}
820{ .mfi
821      nop.m          0
822      fma.s1         fA5 = fA5, fArgSqr, fA4
823      nop.i          0
824}
825;;
826{ .mfi
827      nop.m          0
828      fma.s1         fA1 = fA1, fArgSqr, fA0
829      nop.i          0
830}
831{ .mfi
832      nop.m          0
833      fma.s1         fTQuadrSgn = fTQuadr, f8, f0
834      nop.i          0
835}
836;;
837{ .mfi
838      nop.m          0
839      fma.s1         fRes = fRes, fTSqr, fA7
840      nop.i          0
841}
842;;
843{ .mfi
844      nop.m          0
845      fma.s1         fA1 = fA3, fTSqr, fA1
846      nop.i          0
847}
848;;
849{ .mfi
850      nop.m          0
851      fma.s1         fRes = fRes, fTSqr, fA5
852      nop.i          0
853}
854;;
855{ .mfi
856      nop.m          0
857      fma.s1         fA1 = fA1, f8, f0
858      nop.i          0
859}
860;;
861{ .mfb
862      nop.m          0
863      fma.d.s0       f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2)
864      br.ret.sptk    b0                              // Exit for |x| < 0.5
865};;
866
867// Here if 5.90625 <= |x| < +inf
868.align 32
869erf_saturation:
870{ .mfi
871      adds           rDataPtr = 1376, rDataPtr     // address of A0
872      nop.f          0
873      nop.i          0
874}
875;;
876{ .mfi
877      ldfe           fA0 = [rDataPtr]
878      nop.f          0
879      nop.i          0
880}
881;;
882{ .mfb
883      nop.m          0
884      fma.d.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0 - 2^(-63))
885      // Exit for 5.90625 <= |x| < +inf
886      br.ret.sptk    b0                          // Exit for 5.90625 <=|x|< +inf
887}
888;;
889
890// Here if x is double precision denormal
891.align 32
892erf_denormal:
893{ .mfi
894      adds           rDataPtr = 1632, rDataPtr    // address of A0
895      fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
896      nop.i          0
897}
898;;
899{ .mfi
900      ldfe           fA0 = [rDataPtr]             // A0
901      nop.f          0
902      nop.i          0
903}
904;;
905{ .mfi
906      nop.m          0
907      fma.s1         fA0 = fA0,f8,f0              // A0*x
908      nop.i          0
909}
910;;
911{ .mfi
912      nop.m          0
913(p7)  fma.d.s0       f8 = f8,f8,fA0               // -denormal
914      nop.i          0
915}
916{ .mfb
917      nop.m          0
918(p8)  fnma.d.s0      f8 = f8,f8,fA0               // +denormal
919      br.ret.sptk    b0                           // Exit for denormal
920}
921;;
922
923GLOBAL_LIBM_END(erf)
924libm_alias_double_other (erf, erf)
925