1.file "erff.s"
2
3
4// Copyright (c) 2001 - 2005, Intel Corporation
5// All rights reserved.
6//
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12// * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//
15// * Redistributions in binary form must reproduce the above copyright
16// notice, this list of conditions and the following disclaimer in the
17// documentation and/or other materials provided with the distribution.
18//
19// * The name of Intel Corporation may not be used to endorse or promote
20// products derived from this software without specific prior written
21// permission.
22
23// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34//
35// Intel Corporation is the author of this code, and requests that all
36// problem reports or change requests be submitted to it directly at
37// http://www.intel.com/software/products/opensource/libraries/num.htm.
38//
39// History
40//==============================================================
41// 08/14/01 Initial version
42// 05/20/02 Cleaned up namespace and sf0 syntax
43// 02/06/03 Reordered header: .section, .global, .proc, .align
44// 03/31/05 Reformatted delimiters between data tables
45//
46// API
47//==============================================================
48// float erff(float)
49//
50// Overview of operation
51//==============================================================
52// Background
53//
54//
55// There are 8 paths:
56// 1. x = +/-0.0
57//    Return erff(x) = +/-0.0
58//
59// 2. 0.0 < |x| < 0.125
60//    Return erff(x) = x *Pol3(x^2),
61//    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
62//
63// 3. 0.125 <= |x| < 4.0
64//    Return erff(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
65//    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
66//          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
67//          PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
68//
69//    Actually range 0.125<=|x|< 4.0 is splitted to 5 subranges.
70//    For each subrange there is particular set of coefficients.
71//    Below is the list of subranges:
72//    3.1 0.125 <= |x| < 0.25
73//    3.2 0.25 <= |x| < 0.5
74//    3.3 0.5 <= |x| < 1.0
75//    3.4 1.0 <= |x| < 2.0
76//    3.5 2.0 <= |x| < 4.0
77//
78// 4. 4.0 <= |x| < +INF
79//    Return erff(x) = sign(x)*(1.0d - 2^(-52))
80//
81// 5. |x| = INF
82//    Return erff(x) = sign(x) * 1.0
83//
84// 6. x = [S,Q]NaN
85//    Return erff(x) = QNaN
86//
87// 7. x is positive denormal
88//    Return erff(x) = C0*x - x^2,
89//    where C0 = 2.0/sqrt(Pi)
90//
91// 8. x is negative denormal
92//    Return erff(x) = C0*x + x^2,
93//    where C0 = 2.0/sqrt(Pi)
94//
95// Registers used
96//==============================================================
97// Floating Point registers used:
98// f8, input
99// f32 -> f59
100
101// General registers used:
102// r32 -> r45, r2, r3
103
104// Predicate registers used:
105// p0, p6 -> p12, p14, p15
106
107// p6           to filter out case when x = [Q,S]NaN or +/-0
108// p7           to filter out case when x = denormal
109// p8           set if |x| >= 0.3125, used also to process denormal input
110// p9           to filter out case when |x| = inf
111// p10          to filter out case when |x| < 0.125
112// p11          to filter out case when 0.125 <= |x| < 4.0
113// p12          to filter out case when |x| >= 4.0
114// p14          set to 1 for positive x
115// p15          set to 1 for negative x
116
117// Assembly macros
118//==============================================================
119rDataPtr           = r2
120rDataPtr1          = r3
121
122rBias              = r33
123rCoeffAddr3        = r34
124rCoeffAddr1        = r35
125rCoeffAddr2        = r36
126rOffset2           = r37
127rBias2             = r38
128rMask              = r39
129rArg               = r40
130rBound             = r41
131rSignBit           = r42
132rAbsArg            = r43
133rDataPtr2          = r44
134rSaturation        = r45
135
136//==============================================================
137fA0                = f32
138fA1                = f33
139fA2                = f34
140fA3                = f35
141fC0                = f36
142fC1                = f37
143fC2                = f38
144fC3                = f39
145fD0                = f40
146fD1                = f41
147fD2                = f42
148fB0                = f43
149fArgSqr            = f44
150fAbsArg            = f45
151fSignumX           = f46
152fArg4              = f47
153fArg4Sgn           = f48
154fArg3              = f49
155fArg3Sgn           = f50
156fArg7Sgn           = f51
157fArg6Sgn           = f52
158fPolC              = f53
159fPolCTmp           = f54
160fPolA              = f55
161fPolATmp           = f56
162fPolD              = f57
163fPolDTmp           = f58
164fArgSqrSgn         = f59
165
166// Data tables
167//==============================================================
168
169RODATA
170
171.align 16
172
173LOCAL_OBJECT_START(erff_data)
174// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25
175data8 0xBE4218BB56B49E66 // C0
176data8 0x3F7AFB8315DA322B // C1
177data8 0x3F615D6EBEE0CA32 // C2
178data8 0xBF468D71CF4F0918 // C3
179data8 0x40312115B0932F24 // D0
180data8 0xC0160D6CD0991EA3 // D1
181data8 0xBFE04A567A6DBE4A // D2
182data8 0xBF4207BC640D1509 // B0
183// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5
184data8 0x3F90849356383F58 // C0
185data8 0x3F830BD5BA240F09 // C1
186data8 0xBF3FA4970E2BCE23 // C2
187data8 0xBF6061798E58D0FD // C3
188data8 0xBF68C0D83DD22E02 // D0
189data8 0x401C0A9EE4108F94 // D1
190data8 0xC01056F9B5E387F5 // D2
191data8 0x3F1C9744E36A5706 // B0
192// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
193data8 0x3F85F7D419A13DE3 // C0
194data8 0x3F791A13FF66D45A // C1
195data8 0x3F46B17B16B5929F // C2
196data8 0xBF5124947A8BF45E // C3
197data8 0x3FA1B3FD95EA9564 // D0
198data8 0x40250CECD79A020A // D1
199data8 0xC0190DC96FF66CCD // D2
200data8 0x3F4401AE28BA4DD5 // B0
201// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
202data8 0xBF49E07E3584C3AE // C0
203data8 0x3F3166621131445C // C1
204data8 0xBF65B7FC1EAC2099 // C2
205data8 0x3F508C6BD211D736 // C3
206data8 0xC053FABD70601067 // D0
207data8 0x404A06640EE87808 // D1
208data8 0xC0283F30817A3F08 // D2
209data8 0xBF2F6DBBF4D6257F // B0
210// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0
211data8 0xBF849855D67E9407 // C0
212data8 0x3F5ECA5FEC01C70C // C1
213data8 0xBF483110C30FABA4 // C2
214data8 0x3F1618DA72860403 // C3
215data8 0xC08A5C9D5FE8B9F6 // D0
216data8 0x406EFF5F088CEC4B // D1
217data8 0xC03A5743DF38FDE0 // D2
218data8 0xBEE397A9FA5686A2 // B0
219// Polynomial coefficients for the erf(x), -0.125 < x < 0.125
220data8 0x3FF20DD7504270CB // C0
221data8 0xBFD8127465AFE719 // C1
222data8 0x3FBCE2D77791DD77 // C2
223data8 0xBF9B582755CDF345 // C3
224// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25
225data8 0xBD54E7E451AF0E36 // A0
226data8 0x3FF20DD75043FE20 // A1
227data8 0xBE05680ACF8280E4 // A2
228data8 0xBFD812745E92C3D3 // A3
229// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5
230data8 0xBE1ACEC2859CB55F // A0
231data8 0x3FF20DD75E8D2B64 // A1
232data8 0xBEABC6A83208FCFC // A2
233data8 0xBFD81253E42E7B99 // A3
234// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
235data8 0x3EABD5A2482B4979 // A0
236data8 0x3FF20DCAA52085D5 // A1
237data8 0x3F13A994A348795B // A2
238data8 0xBFD8167B2DFCDE44 // A3
239// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
240data8 0xBF5BA377DDAB4E17 // A0
241data8 0x3FF2397F1D8FC0ED // A1
242data8 0xBF9945BFC1915C21 // A2
243data8 0xBFD747AAABB690D8 // A3
244// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0
245data8 0x3FF0E2920E0391AF // A0
246data8 0xC00D249D1A95A5AE // A1
247data8 0x40233905061C3803 // A2
248data8 0xC027560B851F7690 // A3
249//
250data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon
251data8 0x3FF20DD750429B6D // C0 = 2.0/sqrt(Pi)
252LOCAL_OBJECT_END(erff_data)
253
254
255.section .text
256GLOBAL_LIBM_ENTRY(erff)
257
258{ .mfi
259      alloc          r32 = ar.pfs, 0, 14, 0, 0
260      fmerge.s       fAbsArg = f1, f8             // |x|
261      addl           rMask = 0x806, r0
262}
263{ .mfi
264      addl           rDataPtr = @ltoff(erff_data), gp
265      fma.s1         fArgSqr = f8, f8, f0         // x^2
266      adds           rSignBit = 0x1, r0
267}
268;;
269
270{ .mfi
271      getf.s         rArg = f8                    // x in GR
272      fclass.m       p7,p0 = f8, 0x0b             // is x denormal ?
273      // sign bit and 2 most bits in significand
274      shl            rMask = rMask, 20
275}
276{ .mfi
277      ld8            rDataPtr = [rDataPtr]
278      nop.f          0
279      adds           rBias2 = 0x1F0, r0
280}
281;;
282
283{ .mfi
284      nop.m          0
285      fmerge.s       fSignumX = f8, f1            // signum(x)
286      shl            rSignBit = rSignBit, 31      // mask for sign bit
287}
288{ .mfi
289      adds           rBound = 0x3E0, r0
290      nop.f          0
291      adds           rSaturation = 0x408, r0
292}
293;;
294
295{ .mfi
296      andcm          rOffset2 = rArg, rMask
297      fclass.m       p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
298      shl            rBound = rBound, 20          // 0.125f in GR
299}
300{ .mfb
301      andcm          rAbsArg = rArg, rSignBit     // |x| in GR
302      nop.f          0
303(p7)  br.cond.spnt   erff_denormal               // branch out if x is denormal
304}
305;;
306
307{ .mfi
308      adds           rCoeffAddr2 = 352, rDataPtr
309      fclass.m       p9,p0 = f8, 0x23            // is x +/- inf?
310      shr            rOffset2 = rOffset2, 21
311}
312{ .mfi
313      cmp.lt         p10, p8 = rAbsArg, rBound   // |x| < 0.125?
314      nop.f          0
315      adds           rCoeffAddr3 = 16, rDataPtr
316}
317;;
318
319{ .mfi
320(p8)  sub            rBias = rOffset2, rBias2
321      fma.s1         fArg4 = fArgSqr, fArgSqr, f0 // x^4
322      shl            rSaturation = rSaturation, 20// 4.0 in GR (saturation bound)
323}
324{ .mfb
325(p10) adds           rBias = 0x14, r0
326(p6)  fma.s.s0       f8 = f8,f1,f8                // NaN or +/-0
327(p6)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
328}
329;;
330
331{ .mfi
332      shladd         rCoeffAddr1 = rBias, 4, rDataPtr
333      fma.s1         fArg3Sgn = fArgSqr, f8, f0  // sign(x)*|x|^3
334      // is |x| < 4.0?
335      cmp.lt         p11, p12 = rAbsArg, rSaturation
336}
337{ .mfi
338      shladd         rCoeffAddr3 = rBias, 4, rCoeffAddr3
339      fma.s1         fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
340      shladd         rCoeffAddr2 = rBias, 3, rCoeffAddr2
341}
342;;
343
344{ .mfi
345(p11) ldfpd          fC0, fC1 = [rCoeffAddr1]
346(p9)  fmerge.s       f8 = f8,f1                   // +/- inf
347(p12) adds           rDataPtr = 512, rDataPtr
348}
349{ .mfb
350(p11) ldfpd          fC2, fC3 = [rCoeffAddr3], 16
351      nop.f          0
352(p9)  br.ret.spnt    b0                           // exit for x = +/- inf
353}
354;;
355
356{ .mfi
357(p11) ldfpd          fA0, fA1 = [rCoeffAddr2], 16
358      nop.f          0
359      nop.i          0
360}
361{ .mfi
362      add            rCoeffAddr1 = 48, rCoeffAddr1
363      nop.f          0
364      nop.i          0
365}
366;;
367
368{ .mfi
369(p11) ldfpd          fD0, fD1 = [rCoeffAddr3]
370      nop.f          0
371      nop.i          0
372}
373{ .mfb
374(p11) ldfpd          fD2, fB0 = [rCoeffAddr1]
375      // sign(x)*|x|^2
376      fma.s1         fArgSqrSgn = fArgSqr, fSignumX, f0
377(p10) br.cond.spnt   erff_near_zero
378}
379;;
380
381{ .mfi
382(p11) ldfpd          fA2, fA3 = [rCoeffAddr2], 16
383      fcmp.lt.s1     p15, p14 = f8,f0
384      nop.i          0
385}
386{ .mfb
387(p12) ldfd           fA0 = [rDataPtr]
388      fma.s1         fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
389(p12) br.cond.spnt   erff_saturation
390}
391;;
392{ .mfi
393      nop.m          0
394      fma.s1         fArg7Sgn = fArg4, fArg3Sgn, f0  // sign(x)*|x|^7
395      nop.i          0
396}
397{ .mfi
398      nop.m          0
399      fma.s1         fArg6Sgn = fArg3, fArg3Sgn, f0  // sign(x)*|x|^6
400      nop.i          0
401}
402;;
403
404{ .mfi
405      nop.m          0
406      fma.s1         fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
407      nop.i          0
408}
409{ .mfi
410      nop.m          0
411      fma.s1         fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
412      nop.i          0
413};;
414
415{ .mfi
416      nop.m          0
417      fma.s1         fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
418      nop.i          0
419}
420;;
421
422{ .mfi
423      nop.m          0
424      fma.s1         fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
425      nop.i          0
426}
427{ .mfi
428      nop.m          0
429      // sign(x)*(|x|^7 + D2*x^6)
430      fma.s1         fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
431      nop.i          0
432};;
433
434{ .mfi
435      nop.m          0
436      fma.s1         fPolATmp = fA3, fAbsArg, fA2  // A3*|x| + A2
437      nop.i          0
438}
439{ .mfi
440      nop.m          0
441      fma.s1         fB0 = fB0, fArg4, f0          // B0*x^4
442      nop.i          0
443};;
444
445{ .mfi
446      nop.m          0
447      // C3*|x|^3 + C2*x^2 + C1*|x| + C0
448      fma.s1         fPolC = fPolC, fArgSqr, fPolCTmp
449      nop.i          0
450}
451;;
452
453{ .mfi
454      nop.m          0
455      // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
456      fma.d.s1       fPolD = fPolD, fArg4Sgn, fPolDTmp
457      nop.i          0
458}
459;;
460
461{ .mfi
462      nop.m          0
463      // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
464      fma.d.s1       fPolA = fPolATmp, fArgSqr, fPolA
465      nop.i          0
466}
467;;
468
469{ .mfi
470      nop.m          0
471      // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
472      fma.d.s1       fPolC = fPolC, f1, fB0
473      nop.i          0
474}
475;;
476
477{ .mfi
478      nop.m          0
479(p14) fma.s.s0       f8 = fPolC, fPolD, fPolA     // for positive x
480      nop.i          0
481}
482{ .mfb
483      nop.m          0
484(p15) fms.s.s0       f8 = fPolC, fPolD, fPolA     // for negative x
485      br.ret.sptk    b0                           // Exit for 0.125 <=|x|< 4.0
486};;
487
488
489// Here if |x| < 0.125
490erff_near_zero:
491{ .mfi
492      nop.m          0
493      fma.s1         fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
494      nop.i          0
495}
496{ .mfi
497      nop.m          0
498      fma.s1         fPolCTmp = fC1, fArgSqr, fC0  // C1*x^2 + C0
499      nop.i          0
500};;
501
502{ .mfi
503      nop.m          0
504      fma.s1         fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
505      nop.i          0
506};;
507
508{ .mfb
509      nop.m          0
510      // x*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
511      fma.s.s0       f8 = fPolC, f8, f0
512      br.ret.sptk    b0                           // Exit for |x| < 0.125
513};;
514
515// Here if 4.0 <= |x| < +inf
516erff_saturation:
517{ .mfb
518      nop.m          0
519      fma.s.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-52))
520      // Exit for 4.0 <= |x| < +inf
521      br.ret.sptk    b0                           // Exit for 4.0 <=|x|< +inf
522}
523;;
524
525// Here if x is single precision denormal
526erff_denormal:
527{ .mfi
528      adds           rDataPtr = 520, rDataPtr     // address of C0
529      fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
530      nop.i          0
531}
532;;
533{ .mfi
534      ldfd           fC0 = [rDataPtr]             // C0
535      nop.f          0
536      nop.i          0
537}
538;;
539{ .mfi
540      nop.m          0
541      fma.s1         fC0 = fC0,f8,f0              // C0*x
542      nop.i          0
543}
544;;
545{ .mfi
546      nop.m          0
547(p7)  fma.s.s0       f8 = f8,f8,fC0               // -denormal
548      nop.i          0
549}
550{ .mfb
551      nop.m          0
552(p8)  fnma.s.s0      f8 = f8,f8,fC0               // +denormal
553      br.ret.sptk    b0                           // Exit for denormal
554}
555;;
556
557GLOBAL_LIBM_END(erff)
558libm_alias_float_other (erf, erf)
559