Skip to content

Commit 58d2ed8

Browse files
Port Acos from optimized aocl-libm-ose source with Sollya polynomial coefficients
Agent-Logs-Url: https://github.com/dotnet/runtime/sessions/22f9ee13-f6b9-4c35-ad05-63ff26534e31 Co-authored-by: tannergooding <10487869+tannergooding@users.noreply.github.com>
1 parent bf6385a commit 58d2ed8

1 file changed

Lines changed: 95 additions & 106 deletions

File tree

  • src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/VectorMath.cs

Lines changed: 95 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -3227,91 +3227,83 @@ public static TVectorDouble AcosDouble<TVectorDouble, TVectorUInt64>(TVectorDoub
32273227
where TVectorUInt64 : unmanaged, ISimdVector<TVectorUInt64, ulong>
32283228
{
32293229
// This code is based on `acos` from amd/aocl-libm-ose
3230-
// Copyright (C) 2008-2022 Advanced Micro Devices, Inc. All rights reserved.
3230+
// Copyright (C) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
32313231
//
32323232
// Licensed under the BSD 3-Clause "New" or "Revised" License
32333233
// See THIRD-PARTY-NOTICES.TXT for the full license text
32343234

32353235
// Implementation Notes
32363236
// --------------------
32373237
// Based on the value of x, acos(x) is calculated as:
3238-
// For |x| <= 0.5: acos(x) = pi/2 - (x + x^3*R(x^2))
3239-
// For |x| > 0.5: use acos(x) = pi - 2*asin(sqrt((1-|x|)/2)) or 2*asin(sqrt((1-x)/2))
3240-
// where R(x^2) is a [5,4] rational minimax approximation (same as asin.c).
3241-
3242-
// Rational polynomial coefficients (same as asin.c / AMD acos.c)
3243-
const double C1 = 0.227485835556935010735943483075;
3244-
const double C2 = -0.445017216867635649900123110649;
3245-
const double C3 = 0.275558175256937652532686256258;
3246-
const double C4 = -0.0549989809235685841612020091328;
3247-
const double C5 = 0.00109242697235074662306043804220;
3248-
const double C6 = 0.0000482901920344786991880522822991;
3249-
3250-
const double D1 = 1.36491501334161032038194214209;
3251-
const double D2 = -3.28431505720958658909889444194;
3252-
const double D3 = 2.76568859157270989520376345954;
3253-
const double D4 = -0.943639137032492685763471240072;
3254-
const double D5 = 0.105869422087204370341222318533;
3255-
3256-
const double PI = 3.1415926535897933e+00; // 0x400921fb54442d18
3257-
const double PIBY2_HEAD = 1.5707963267948966e+00; // 0x3ff921fb54442d18
3258-
const double PIBY2_TAIL = 6.1232339957367660e-17; // 0x3c91a62633145c07
3238+
//
3239+
// 1. If x > 0.5: acos(x) = 2 * asin(sqrt((1 - x) / 2))
3240+
// 2. If x < -0.5: acos(x) = pi - 2 * asin(sqrt((1 + x) / 2))
3241+
// 3. If |x| <= 0.5: acos(x) = pi/2 - asin(x)
3242+
//
3243+
// asin(x) is approximated using the polynomial:
3244+
// x + C1*x^3 + C2*x^5 + ... + C12*x^25
3245+
3246+
// Polynomial coefficients obtained from Sollya
3247+
const double C1 = 0.166666666666647700; // 0x1.55555555552aap-3
3248+
const double C2 = 0.075000000004179696; // 0x1.333333337cbaep-4
3249+
const double C3 = 0.044642856781408560; // 0x1.6db6db3c0984p-5
3250+
const double C4 = 0.030381960650355640; // 0x1.f1c72dd86cbafp-6
3251+
const double C5 = 0.022371727970318958; // 0x1.6e89d3ff33aa4p-6
3252+
const double C6 = 0.017360094637841349; // 0x1.1c6d83ae664b6p-6
3253+
const double C7 = 0.013881842859634605; // 0x1.c6e1568b90518p-7
3254+
const double C8 = 0.012189191110336799; // 0x1.8f6a58977fe49p-7
3255+
const double C9 = 0.006449405266899452; // 0x1.a6ab10b3321bp-8
3256+
const double C10 = 0.019725887785684789; // 0x1.43305ebb2428fp-6
3257+
const double C11 = -0.016511752058748410; // -0x1.0e874ec5e3157p-6
3258+
const double C12 = 0.032096272998247702; // 0x1.06eec35b3b142p-5
3259+
3260+
const double PIBY2 = 1.5707963267948966; // 0x1.921fb54442d18p+0
3261+
const double PIBY4 = 0.78539816339744828; // 0x1.921fb54442d18p-1
32593262

32603263
TVectorDouble xneg = TVectorDouble.LessThan(x, TVectorDouble.Zero);
32613264
TVectorDouble ax = TVectorDouble.Abs(x);
32623265

3263-
TVectorDouble transformMask = TVectorDouble.GreaterThanOrEqual(ax, TVectorDouble.Create(0.5));
3264-
3265-
// For |x| >= 0.5: r = 0.5*(1-ax), s = sqrt(r)
3266-
// For |x| < 0.5: r = ax*ax
3267-
TVectorDouble r = TVectorDouble.ConditionalSelect(transformMask, TVectorDouble.Create(0.5) * (TVectorDouble.One - ax), ax * ax);
3268-
TVectorDouble s = TVectorDouble.Sqrt(r);
3269-
3270-
// Evaluate numerator: r*(C1 + r*(C2 + r*(C3 + r*(C4 + r*(C5 + r*C6)))))
3271-
TVectorDouble polyNum = TVectorDouble.Create(C6);
3272-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C5));
3273-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C4));
3274-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C3));
3275-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C2));
3276-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C1));
3277-
3278-
// Evaluate denominator: D1 + r*(D2 + r*(D3 + r*(D4 + r*D5)))
3279-
TVectorDouble polyDen = TVectorDouble.Create(D5);
3280-
polyDen = TVectorDouble.MultiplyAddEstimate(polyDen, r, TVectorDouble.Create(D4));
3281-
polyDen = TVectorDouble.MultiplyAddEstimate(polyDen, r, TVectorDouble.Create(D3));
3282-
polyDen = TVectorDouble.MultiplyAddEstimate(polyDen, r, TVectorDouble.Create(D2));
3283-
polyDen = TVectorDouble.MultiplyAddEstimate(polyDen, r, TVectorDouble.Create(D1));
3284-
3285-
// u = r * polyNum / polyDen
3286-
TVectorDouble u = r * polyNum / polyDen;
3287-
3288-
// For transform region (|x| >= 0.5):
3289-
// s1 = high part of s (clear low 32 bits for precision)
3290-
// c = (r - s1*s1) / (s + s1)
3291-
TVectorDouble s1 = Unsafe.BitCast<TVectorUInt64, TVectorDouble>(Unsafe.BitCast<TVectorDouble, TVectorUInt64>(s) & TVectorUInt64.Create(0xFFFFFFFF00000000));
3292-
TVectorDouble c = (r - s1 * s1) / (s + s1);
3293-
3294-
// For x < 0, |x| >= 0.5: acos(x) = pi - 2*(s + (s*u - piby2_tail))
3295-
TVectorDouble transformNeg = TVectorDouble.Create(PI) - TVectorDouble.Create(2.0) * (s + (s * u - TVectorDouble.Create(PIBY2_TAIL)));
3296-
// For x > 0, |x| >= 0.5: acos(x) = 2*s1 + (2*c + 2*s*u)
3297-
TVectorDouble transformPos = TVectorDouble.Create(2.0) * s1 + (TVectorDouble.Create(2.0) * c + TVectorDouble.Create(2.0) * s * u);
3298-
TVectorDouble vTransform = TVectorDouble.ConditionalSelect(xneg, transformNeg, transformPos);
3299-
3300-
// For |x| < 0.5: acos(x) = piby2_head - (x - (piby2_tail - x*u))
3301-
TVectorDouble vNormal = TVectorDouble.Create(PIBY2_HEAD) - (x - (TVectorDouble.Create(PIBY2_TAIL) - x * u));
3302-
3303-
TVectorDouble result = TVectorDouble.ConditionalSelect(transformMask, vTransform, vNormal);
3304-
3305-
// Handle special cases: |x| > 1 returns NaN, x = ±1 returns 0 or π
3266+
TVectorDouble gtHalf = TVectorDouble.GreaterThan(ax, TVectorDouble.Create(0.5));
3267+
3268+
// For |x| > 0.5: z = 0.5*(1-|x|), y = -2*sqrt(z)
3269+
// For |x| <= 0.5: z = |x|*|x|, y = |x|
3270+
TVectorDouble z = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(0.5) * (TVectorDouble.One - ax), ax * ax);
3271+
TVectorDouble y = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(-2.0) * TVectorDouble.Sqrt(z), ax);
3272+
3273+
// Evaluate polynomial: P(z) = C1 + z*(C2 + z*(C3 + ... + z*C12))
3274+
TVectorDouble poly = TVectorDouble.Create(C12);
3275+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C11));
3276+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C10));
3277+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C9));
3278+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C8));
3279+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C7));
3280+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C6));
3281+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C5));
3282+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C4));
3283+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C3));
3284+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C2));
3285+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C1));
3286+
3287+
// poly = y + y * z * P(z)
3288+
poly = y + y * z * poly;
3289+
3290+
// Reconstruct acos using split constants for precision:
3291+
// |x| > 0.5: A = 0, B = pi/2
3292+
// |x| <= 0.5: A = pi/4, B = pi/4
3293+
// positive x: result = (A - poly) + A
3294+
// negative x: result = (B + poly) + B
3295+
TVectorDouble aConst = TVectorDouble.Create(PIBY4) & ~gtHalf;
3296+
TVectorDouble bConst = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(PIBY2), TVectorDouble.Create(PIBY4));
3297+
3298+
TVectorDouble posResult = (aConst - poly) + aConst;
3299+
TVectorDouble negResult = (bConst + poly) + bConst;
3300+
3301+
TVectorDouble result = TVectorDouble.ConditionalSelect(xneg, negResult, posResult);
3302+
3303+
// Handle special cases: |x| > 1 returns NaN
33063304
TVectorDouble absXGreaterThanOne = TVectorDouble.GreaterThan(ax, TVectorDouble.One);
33073305
result = TVectorDouble.ConditionalSelect(absXGreaterThanOne, TVectorDouble.Create(double.NaN), result);
33083306

3309-
TVectorDouble xEqualsOne = TVectorDouble.Equals(x, TVectorDouble.One);
3310-
result = TVectorDouble.ConditionalSelect(xEqualsOne, TVectorDouble.Zero, result);
3311-
3312-
TVectorDouble xEqualsNegOne = TVectorDouble.Equals(x, TVectorDouble.Create(-1.0));
3313-
result = TVectorDouble.ConditionalSelect(xEqualsNegOne, TVectorDouble.Create(PI), result);
3314-
33153307
return result;
33163308
}
33173309

@@ -3357,51 +3349,48 @@ public static TVectorSingle AcosSingle<TVectorSingle, TVectorInt32, TVectorDoubl
33573349
private static TVectorDouble AcosSingleCoreDouble<TVectorDouble>(TVectorDouble dx)
33583350
where TVectorDouble : unmanaged, ISimdVector<TVectorDouble, double>
33593351
{
3360-
// Rational polynomial coefficients from AMD acosf.c (same as asinf.c)
3361-
const double C1 = 0.184161606965100694821398249421;
3362-
const double C2 = -0.0565298683201845211985026327361;
3363-
const double C3 = -0.0133819288943925804214011424456;
3364-
const double C4 = -0.00396137437848476485201154797087;
3365-
const double D1 = 1.10496961524520294485512696706;
3366-
const double D2 = -0.836411276854206731913362287293;
3367-
3368-
// High-precision pi constants
3369-
const double PI = 3.1415926535897933e+00; // 0x400921fb54442d18
3370-
const double PIBY2_HEAD = 1.5707963267948966e+00; // 0x3ff921fb54442d18
3371-
const double PIBY2_TAIL = 6.1232339957367660e-17; // 0x3c91a62633145c07
3352+
// Polynomial coefficients from Sollya (AMD aocl-libm-ose acosf.c)
3353+
const double C1 = 0.166667014360427856445; // 0x1.5555fcp-3
3354+
const double C2 = 0.074944347143173218; // 0x1.32f8d8p-4
3355+
const double C3 = 0.045550186187028885; // 0x1.7525aap-5
3356+
const double C4 = 0.023858169093728065; // 0x1.86e46ap-6
3357+
const double C5 = 0.042635641992092133; // 0x1.5d456cp-5
3358+
3359+
const double PIBY2 = 1.5707963267948966; // 0x1.921fb54442d18p+0
3360+
const double PIBY4 = 0.78539816339744828; // 0x1.921fb54442d18p-1
33723361

33733362
TVectorDouble xneg = TVectorDouble.LessThan(dx, TVectorDouble.Zero);
33743363
TVectorDouble ax = TVectorDouble.Abs(dx);
33753364

3376-
TVectorDouble transformMask = TVectorDouble.GreaterThanOrEqual(ax, TVectorDouble.Create(0.5));
3377-
3378-
// For |x| >= 0.5: r = 0.5*(1-ax), s = sqrt(r)
3379-
// For |x| < 0.5: r = ax*ax
3380-
TVectorDouble r = TVectorDouble.ConditionalSelect(transformMask, TVectorDouble.Create(0.5) * (TVectorDouble.One - ax), ax * ax);
3381-
TVectorDouble s = TVectorDouble.Sqrt(r);
3365+
TVectorDouble gtHalf = TVectorDouble.GreaterThan(ax, TVectorDouble.Create(0.5));
33823366

3383-
// Rational polynomial: u = r * (C1 + (C2 + (C3 + C4*r)*r)*r) / (D1 + D2*r)
3384-
TVectorDouble polyNum = TVectorDouble.Create(C4);
3385-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C3));
3386-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C2));
3387-
polyNum = TVectorDouble.MultiplyAddEstimate(polyNum, r, TVectorDouble.Create(C1));
3367+
// For |x| > 0.5: z = 0.5*(1-|x|), y = -2*sqrt(z)
3368+
// For |x| <= 0.5: z = |x|*|x|, y = |x|
3369+
TVectorDouble z = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(0.5) * (TVectorDouble.One - ax), ax * ax);
3370+
TVectorDouble y = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(-2.0) * TVectorDouble.Sqrt(z), ax);
33883371

3389-
TVectorDouble polyDen = TVectorDouble.MultiplyAddEstimate(TVectorDouble.Create(D2), r, TVectorDouble.Create(D1));
3372+
// Evaluate polynomial: P(z) = C1 + z*(C2 + z*(C3 + z*(C4 + z*C5)))
3373+
TVectorDouble poly = TVectorDouble.Create(C5);
3374+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C4));
3375+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C3));
3376+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C2));
3377+
poly = TVectorDouble.MultiplyAddEstimate(poly, z, TVectorDouble.Create(C1));
33903378

3391-
TVectorDouble u = r * polyNum / polyDen;
3379+
// poly = y + y * z * P(z)
3380+
poly = y + y * z * poly;
33923381

3393-
// For x < 0, |x| >= 0.5: acos(x) = pi - 2*(s + (s*u - piby2_tail))
3394-
// Since we're in double for float output, piby2_tail correction provides extra precision
3395-
TVectorDouble transformNeg = TVectorDouble.Create(PI) - TVectorDouble.Create(2.0) * (s + (s * u - TVectorDouble.Create(PIBY2_TAIL)));
3396-
// For x > 0, |x| >= 0.5: acos(x) = 2*s + 2*s*u
3397-
// (s1/c correction omitted: computing in double for float output, precision is sufficient)
3398-
TVectorDouble transformPos = TVectorDouble.Create(2.0) * s + TVectorDouble.Create(2.0) * s * u;
3399-
TVectorDouble vTransform = TVectorDouble.ConditionalSelect(xneg, transformNeg, transformPos);
3382+
// Reconstruct acos using split constants for precision:
3383+
// |x| > 0.5: A = 0, B = pi/2
3384+
// |x| <= 0.5: A = pi/4, B = pi/4
3385+
// positive x: result = (A - poly) + A
3386+
// negative x: result = (B + poly) + B
3387+
TVectorDouble aConst = TVectorDouble.Create(PIBY4) & ~gtHalf;
3388+
TVectorDouble bConst = TVectorDouble.ConditionalSelect(gtHalf, TVectorDouble.Create(PIBY2), TVectorDouble.Create(PIBY4));
34003389

3401-
// For |x| < 0.5: acos(x) = piby2_head - (x - (piby2_tail - x*u))
3402-
TVectorDouble vNormal = TVectorDouble.Create(PIBY2_HEAD) - (dx - (TVectorDouble.Create(PIBY2_TAIL) - dx * u));
3390+
TVectorDouble posResult = (aConst - poly) + aConst;
3391+
TVectorDouble negResult = (bConst + poly) + bConst;
34033392

3404-
return TVectorDouble.ConditionalSelect(transformMask, vTransform, vNormal);
3393+
return TVectorDouble.ConditionalSelect(xneg, negResult, posResult);
34053394
}
34063395
}
34073396
}

0 commit comments

Comments
 (0)