@@ -3227,91 +3227,83 @@ public static TVectorDouble AcosDouble<TVectorDouble, TVectorUInt64>(TVectorDoub
32273227 where TVectorUInt64 : unmanaged, ISimdVector < TVectorUInt64 , ulong >
32283228 {
32293229 // This code is based on `acos` from amd/aocl-libm-ose
3230- // Copyright (C) 2008 -2022 Advanced Micro Devices, Inc. All rights reserved.
3230+ // Copyright (C) 2021 -2022 Advanced Micro Devices, Inc. All rights reserved.
32313231 //
32323232 // Licensed under the BSD 3-Clause "New" or "Revised" License
32333233 // See THIRD-PARTY-NOTICES.TXT for the full license text
32343234
32353235 // Implementation Notes
32363236 // --------------------
32373237 // Based on the value of x, acos(x) is calculated as:
3238- // For |x| <= 0.5: acos(x) = pi/2 - (x + x^3*R(x^2))
3239- // For |x| > 0.5: use acos(x) = pi - 2*asin(sqrt((1-|x|)/2)) or 2*asin(sqrt((1-x)/2))
3240- // where R(x^2) is a [5,4] rational minimax approximation (same as asin.c).
3241-
3242- // Rational polynomial coefficients (same as asin.c / AMD acos.c)
3243- const double C1 = 0.227485835556935010735943483075 ;
3244- const double C2 = - 0.445017216867635649900123110649 ;
3245- const double C3 = 0.275558175256937652532686256258 ;
3246- const double C4 = - 0.0549989809235685841612020091328 ;
3247- const double C5 = 0.00109242697235074662306043804220 ;
3248- const double C6 = 0.0000482901920344786991880522822991 ;
3249-
3250- const double D1 = 1.36491501334161032038194214209 ;
3251- const double D2 = - 3.28431505720958658909889444194 ;
3252- const double D3 = 2.76568859157270989520376345954 ;
3253- const double D4 = - 0.943639137032492685763471240072 ;
3254- const double D5 = 0.105869422087204370341222318533 ;
3255-
3256- const double PI = 3.1415926535897933e+00 ; // 0x400921fb54442d18
3257- const double PIBY2_HEAD = 1.5707963267948966e+00 ; // 0x3ff921fb54442d18
3258- const double PIBY2_TAIL = 6.1232339957367660e-17 ; // 0x3c91a62633145c07
3238+ //
3239+ // 1. If x > 0.5: acos(x) = 2 * asin(sqrt((1 - x) / 2))
3240+ // 2. If x < -0.5: acos(x) = pi - 2 * asin(sqrt((1 + x) / 2))
3241+ // 3. If |x| <= 0.5: acos(x) = pi/2 - asin(x)
3242+ //
3243+ // asin(x) is approximated using the polynomial:
3244+ // x + C1*x^3 + C2*x^5 + ... + C12*x^25
3245+
3246+ // Polynomial coefficients obtained from Sollya
3247+ const double C1 = 0.166666666666647700 ; // 0x1.55555555552aap-3
3248+ const double C2 = 0.075000000004179696 ; // 0x1.333333337cbaep-4
3249+ const double C3 = 0.044642856781408560 ; // 0x1.6db6db3c0984p-5
3250+ const double C4 = 0.030381960650355640 ; // 0x1.f1c72dd86cbafp-6
3251+ const double C5 = 0.022371727970318958 ; // 0x1.6e89d3ff33aa4p-6
3252+ const double C6 = 0.017360094637841349 ; // 0x1.1c6d83ae664b6p-6
3253+ const double C7 = 0.013881842859634605 ; // 0x1.c6e1568b90518p-7
3254+ const double C8 = 0.012189191110336799 ; // 0x1.8f6a58977fe49p-7
3255+ const double C9 = 0.006449405266899452 ; // 0x1.a6ab10b3321bp-8
3256+ const double C10 = 0.019725887785684789 ; // 0x1.43305ebb2428fp-6
3257+ const double C11 = - 0.016511752058748410 ; // -0x1.0e874ec5e3157p-6
3258+ const double C12 = 0.032096272998247702 ; // 0x1.06eec35b3b142p-5
3259+
3260+ const double PIBY2 = 1.5707963267948966 ; // 0x1.921fb54442d18p+0
3261+ const double PIBY4 = 0.78539816339744828 ; // 0x1.921fb54442d18p-1
32593262
32603263 TVectorDouble xneg = TVectorDouble . LessThan ( x , TVectorDouble . Zero ) ;
32613264 TVectorDouble ax = TVectorDouble . Abs ( x ) ;
32623265
3263- TVectorDouble transformMask = TVectorDouble . GreaterThanOrEqual ( ax , TVectorDouble . Create ( 0.5 ) ) ;
3264-
3265- // For |x| >= 0.5: r = 0.5*(1-ax), s = sqrt(r)
3266- // For |x| < 0.5: r = ax*ax
3267- TVectorDouble r = TVectorDouble . ConditionalSelect ( transformMask , TVectorDouble . Create ( 0.5 ) * ( TVectorDouble . One - ax ) , ax * ax ) ;
3268- TVectorDouble s = TVectorDouble . Sqrt ( r ) ;
3269-
3270- // Evaluate numerator: r*(C1 + r*(C2 + r*(C3 + r*(C4 + r*(C5 + r*C6)))))
3271- TVectorDouble polyNum = TVectorDouble . Create ( C6 ) ;
3272- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C5 ) ) ;
3273- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C4 ) ) ;
3274- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C3 ) ) ;
3275- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C2 ) ) ;
3276- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C1 ) ) ;
3277-
3278- // Evaluate denominator: D1 + r*(D2 + r*(D3 + r*(D4 + r*D5)))
3279- TVectorDouble polyDen = TVectorDouble . Create ( D5 ) ;
3280- polyDen = TVectorDouble . MultiplyAddEstimate ( polyDen , r , TVectorDouble . Create ( D4 ) ) ;
3281- polyDen = TVectorDouble . MultiplyAddEstimate ( polyDen , r , TVectorDouble . Create ( D3 ) ) ;
3282- polyDen = TVectorDouble . MultiplyAddEstimate ( polyDen , r , TVectorDouble . Create ( D2 ) ) ;
3283- polyDen = TVectorDouble . MultiplyAddEstimate ( polyDen , r , TVectorDouble . Create ( D1 ) ) ;
3284-
3285- // u = r * polyNum / polyDen
3286- TVectorDouble u = r * polyNum / polyDen ;
3287-
3288- // For transform region (|x| >= 0.5):
3289- // s1 = high part of s (clear low 32 bits for precision)
3290- // c = (r - s1*s1) / (s + s1)
3291- TVectorDouble s1 = Unsafe . BitCast < TVectorUInt64 , TVectorDouble > ( Unsafe . BitCast < TVectorDouble , TVectorUInt64 > ( s ) & TVectorUInt64 . Create ( 0xFFFFFFFF00000000 ) ) ;
3292- TVectorDouble c = ( r - s1 * s1 ) / ( s + s1 ) ;
3293-
3294- // For x < 0, |x| >= 0.5: acos(x) = pi - 2*(s + (s*u - piby2_tail))
3295- TVectorDouble transformNeg = TVectorDouble . Create ( PI ) - TVectorDouble . Create ( 2.0 ) * ( s + ( s * u - TVectorDouble . Create ( PIBY2_TAIL ) ) ) ;
3296- // For x > 0, |x| >= 0.5: acos(x) = 2*s1 + (2*c + 2*s*u)
3297- TVectorDouble transformPos = TVectorDouble . Create ( 2.0 ) * s1 + ( TVectorDouble . Create ( 2.0 ) * c + TVectorDouble . Create ( 2.0 ) * s * u ) ;
3298- TVectorDouble vTransform = TVectorDouble . ConditionalSelect ( xneg , transformNeg , transformPos ) ;
3299-
3300- // For |x| < 0.5: acos(x) = piby2_head - (x - (piby2_tail - x*u))
3301- TVectorDouble vNormal = TVectorDouble . Create ( PIBY2_HEAD ) - ( x - ( TVectorDouble . Create ( PIBY2_TAIL ) - x * u ) ) ;
3302-
3303- TVectorDouble result = TVectorDouble . ConditionalSelect ( transformMask , vTransform , vNormal ) ;
3304-
3305- // Handle special cases: |x| > 1 returns NaN, x = ±1 returns 0 or π
3266+ TVectorDouble gtHalf = TVectorDouble . GreaterThan ( ax , TVectorDouble . Create ( 0.5 ) ) ;
3267+
3268+ // For |x| > 0.5: z = 0.5*(1-|x|), y = -2*sqrt(z)
3269+ // For |x| <= 0.5: z = |x|*|x|, y = |x|
3270+ TVectorDouble z = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( 0.5 ) * ( TVectorDouble . One - ax ) , ax * ax ) ;
3271+ TVectorDouble y = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( - 2.0 ) * TVectorDouble . Sqrt ( z ) , ax ) ;
3272+
3273+ // Evaluate polynomial: P(z) = C1 + z*(C2 + z*(C3 + ... + z*C12))
3274+ TVectorDouble poly = TVectorDouble . Create ( C12 ) ;
3275+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C11 ) ) ;
3276+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C10 ) ) ;
3277+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C9 ) ) ;
3278+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C8 ) ) ;
3279+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C7 ) ) ;
3280+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C6 ) ) ;
3281+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C5 ) ) ;
3282+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C4 ) ) ;
3283+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C3 ) ) ;
3284+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C2 ) ) ;
3285+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C1 ) ) ;
3286+
3287+ // poly = y + y * z * P(z)
3288+ poly = y + y * z * poly ;
3289+
3290+ // Reconstruct acos using split constants for precision:
3291+ // |x| > 0.5: A = 0, B = pi/2
3292+ // |x| <= 0.5: A = pi/4, B = pi/4
3293+ // positive x: result = (A - poly) + A
3294+ // negative x: result = (B + poly) + B
3295+ TVectorDouble aConst = TVectorDouble . Create ( PIBY4 ) & ~ gtHalf ;
3296+ TVectorDouble bConst = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( PIBY2 ) , TVectorDouble . Create ( PIBY4 ) ) ;
3297+
3298+ TVectorDouble posResult = ( aConst - poly ) + aConst ;
3299+ TVectorDouble negResult = ( bConst + poly ) + bConst ;
3300+
3301+ TVectorDouble result = TVectorDouble . ConditionalSelect ( xneg , negResult , posResult ) ;
3302+
3303+ // Handle special cases: |x| > 1 returns NaN
33063304 TVectorDouble absXGreaterThanOne = TVectorDouble . GreaterThan ( ax , TVectorDouble . One ) ;
33073305 result = TVectorDouble . ConditionalSelect ( absXGreaterThanOne , TVectorDouble . Create ( double . NaN ) , result ) ;
33083306
3309- TVectorDouble xEqualsOne = TVectorDouble . Equals ( x , TVectorDouble . One ) ;
3310- result = TVectorDouble . ConditionalSelect ( xEqualsOne , TVectorDouble . Zero , result ) ;
3311-
3312- TVectorDouble xEqualsNegOne = TVectorDouble . Equals ( x , TVectorDouble . Create ( - 1.0 ) ) ;
3313- result = TVectorDouble . ConditionalSelect ( xEqualsNegOne , TVectorDouble . Create ( PI ) , result ) ;
3314-
33153307 return result ;
33163308 }
33173309
@@ -3357,51 +3349,48 @@ public static TVectorSingle AcosSingle<TVectorSingle, TVectorInt32, TVectorDoubl
33573349 private static TVectorDouble AcosSingleCoreDouble < TVectorDouble > ( TVectorDouble dx )
33583350 where TVectorDouble : unmanaged, ISimdVector < TVectorDouble , double >
33593351 {
3360- // Rational polynomial coefficients from AMD acosf.c (same as asinf.c)
3361- const double C1 = 0.184161606965100694821398249421 ;
3362- const double C2 = - 0.0565298683201845211985026327361 ;
3363- const double C3 = - 0.0133819288943925804214011424456 ;
3364- const double C4 = - 0.00396137437848476485201154797087 ;
3365- const double D1 = 1.10496961524520294485512696706 ;
3366- const double D2 = - 0.836411276854206731913362287293 ;
3367-
3368- // High-precision pi constants
3369- const double PI = 3.1415926535897933e+00 ; // 0x400921fb54442d18
3370- const double PIBY2_HEAD = 1.5707963267948966e+00 ; // 0x3ff921fb54442d18
3371- const double PIBY2_TAIL = 6.1232339957367660e-17 ; // 0x3c91a62633145c07
3352+ // Polynomial coefficients from Sollya (AMD aocl-libm-ose acosf.c)
3353+ const double C1 = 0.166667014360427856445 ; // 0x1.5555fcp-3
3354+ const double C2 = 0.074944347143173218 ; // 0x1.32f8d8p-4
3355+ const double C3 = 0.045550186187028885 ; // 0x1.7525aap-5
3356+ const double C4 = 0.023858169093728065 ; // 0x1.86e46ap-6
3357+ const double C5 = 0.042635641992092133 ; // 0x1.5d456cp-5
3358+
3359+ const double PIBY2 = 1.5707963267948966 ; // 0x1.921fb54442d18p+0
3360+ const double PIBY4 = 0.78539816339744828 ; // 0x1.921fb54442d18p-1
33723361
33733362 TVectorDouble xneg = TVectorDouble . LessThan ( dx , TVectorDouble . Zero ) ;
33743363 TVectorDouble ax = TVectorDouble . Abs ( dx ) ;
33753364
3376- TVectorDouble transformMask = TVectorDouble . GreaterThanOrEqual ( ax , TVectorDouble . Create ( 0.5 ) ) ;
3377-
3378- // For |x| >= 0.5: r = 0.5*(1-ax), s = sqrt(r)
3379- // For |x| < 0.5: r = ax*ax
3380- TVectorDouble r = TVectorDouble . ConditionalSelect ( transformMask , TVectorDouble . Create ( 0.5 ) * ( TVectorDouble . One - ax ) , ax * ax ) ;
3381- TVectorDouble s = TVectorDouble . Sqrt ( r ) ;
3365+ TVectorDouble gtHalf = TVectorDouble . GreaterThan ( ax , TVectorDouble . Create ( 0.5 ) ) ;
33823366
3383- // Rational polynomial: u = r * (C1 + (C2 + (C3 + C4*r)*r)*r) / (D1 + D2*r)
3384- TVectorDouble polyNum = TVectorDouble . Create ( C4 ) ;
3385- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C3 ) ) ;
3386- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C2 ) ) ;
3387- polyNum = TVectorDouble . MultiplyAddEstimate ( polyNum , r , TVectorDouble . Create ( C1 ) ) ;
3367+ // For |x| > 0.5: z = 0.5*(1-|x|), y = -2*sqrt(z)
3368+ // For |x| <= 0.5: z = |x|*|x|, y = |x|
3369+ TVectorDouble z = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( 0.5 ) * ( TVectorDouble . One - ax ) , ax * ax ) ;
3370+ TVectorDouble y = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( - 2.0 ) * TVectorDouble . Sqrt ( z ) , ax ) ;
33883371
3389- TVectorDouble polyDen = TVectorDouble . MultiplyAddEstimate ( TVectorDouble . Create ( D2 ) , r , TVectorDouble . Create ( D1 ) ) ;
3372+ // Evaluate polynomial: P(z) = C1 + z*(C2 + z*(C3 + z*(C4 + z*C5)))
3373+ TVectorDouble poly = TVectorDouble . Create ( C5 ) ;
3374+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C4 ) ) ;
3375+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C3 ) ) ;
3376+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C2 ) ) ;
3377+ poly = TVectorDouble . MultiplyAddEstimate ( poly , z , TVectorDouble . Create ( C1 ) ) ;
33903378
3391- TVectorDouble u = r * polyNum / polyDen ;
3379+ // poly = y + y * z * P(z)
3380+ poly = y + y * z * poly ;
33923381
3393- // For x < 0, |x| >= 0.5: acos(x) = pi - 2*(s + (s*u - piby2_tail))
3394- // Since we're in double for float output, piby2_tail correction provides extra precision
3395- TVectorDouble transformNeg = TVectorDouble . Create ( PI ) - TVectorDouble . Create ( 2.0 ) * ( s + ( s * u - TVectorDouble . Create ( PIBY2_TAIL ) ) ) ;
3396- // For x > 0, |x| >= 0.5: acos(x) = 2*s + 2*s*u
3397- // (s1/c correction omitted: computing in double for float output, precision is sufficient)
3398- TVectorDouble transformPos = TVectorDouble . Create ( 2.0 ) * s + TVectorDouble . Create ( 2.0 ) * s * u ;
3399- TVectorDouble vTransform = TVectorDouble . ConditionalSelect ( xneg , transformNeg , transformPos ) ;
3382+ // Reconstruct acos using split constants for precision:
3383+ // |x| > 0.5: A = 0, B = pi/2
3384+ // |x| <= 0.5: A = pi/4, B = pi/4
3385+ // positive x: result = (A - poly) + A
3386+ // negative x: result = (B + poly) + B
3387+ TVectorDouble aConst = TVectorDouble . Create ( PIBY4 ) & ~ gtHalf ;
3388+ TVectorDouble bConst = TVectorDouble . ConditionalSelect ( gtHalf , TVectorDouble . Create ( PIBY2 ) , TVectorDouble . Create ( PIBY4 ) ) ;
34003389
3401- // For |x| < 0.5: acos(x) = piby2_head - (x - (piby2_tail - x*u))
3402- TVectorDouble vNormal = TVectorDouble . Create ( PIBY2_HEAD ) - ( dx - ( TVectorDouble . Create ( PIBY2_TAIL ) - dx * u ) ) ;
3390+ TVectorDouble posResult = ( aConst - poly ) + aConst ;
3391+ TVectorDouble negResult = ( bConst + poly ) + bConst ;
34033392
3404- return TVectorDouble . ConditionalSelect ( transformMask , vTransform , vNormal ) ;
3393+ return TVectorDouble . ConditionalSelect ( xneg , negResult , posResult ) ;
34053394 }
34063395 }
34073396}
0 commit comments