// // Lol Engine // // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net> // This program is free software; you can redistribute it and/or // modify it under the terms of the Do What The Fuck You Want To // Public License, Version 2, as published by Sam Hocevar. See // http://www.wtfpl.net/ for more details. // #if defined HAVE_CONFIG_H # include "config.h" #endif #if defined HAVE_FASTMATH_H # include <fastmath.h> #endif #include "core.h" using namespace std; namespace lol { static const double PI_2 = 1.57079632679489661923132; static const double PI_4 = 0.785398163397448309615661; static const double INV_PI = 0.318309886183790671537768; static const double ROOT3 = 1.73205080756887729352745; static const double ZERO = 0.0; static const double ONE = 1.0; static const double NEG_ONE = -1.0; static const double HALF = 0.5; static const double QUARTER = 0.25; static const double TWO = 2.0; #if defined __GNUC__ static const double VERY_SMALL_NUMBER = 0x1.0p-128; #else static const double VERY_SMALL_NUMBER = 3e-39; #endif static const double TWO_EXP_52 = 4503599627370496.0; static const double TWO_EXP_54 = 18014398509481984.0; /** sin Taylor series coefficients. */ static const double SC[] = { -1.6449340668482264364724e-0, // π^2/3! +8.1174242528335364363700e-1, // π^4/5! -1.9075182412208421369647e-1, // π^6/7! +2.6147847817654800504653e-2, // π^8/9! -2.3460810354558236375089e-3, // π^10/11! +1.4842879303107100368487e-4, // π^12/13! -6.9758736616563804745344e-6, // π^14/15! +2.5312174041370276513517e-7, // π^16/17! }; /* Note: the last value should be -1.3878952462213772114468e-7 (ie. * π^18/18!) but we tweak it in order to get the better average precision * required for tan() computations when close to π/2+kπ values. */ static const double CC[] = { -4.9348022005446793094172e-0, // π^2/2! +4.0587121264167682181850e-0, // π^4/4! -1.3352627688545894958753e-0, // π^6/6! +2.3533063035889320454188e-1, // π^8/8! -2.5806891390014060012598e-2, // π^10/10! +1.9295743094039230479033e-3, // π^12/12! -1.0463810492484570711802e-4, // π^14/14! +4.3030695870329470072978e-6, // π^16/16! -1.3777e-7, }; /* These coefficients use Sloane’s http://oeis.org/A002430 and * http://oeis.org/A036279 sequences for the Taylor series of tan(). * Note: the last value should be 2.12485922978838540352881e5 (ie. * 443861162*π^18/1856156927625), but we tweak it in order to get * sub 1e-11 average precision in a larger range. */ static const double TC[] = { 3.28986813369645287294483e0, // π^2/3 1.29878788045336582981920e1, // 2*π^4/15 5.18844961612069061254404e1, // 17*π^6/315 2.07509320280908496804928e2, // 62*π^8/2835 8.30024701695986756361561e2, // 1382*π^10/155925 3.32009324029001216460018e3, // 21844*π^12/6081075 1.32803704909665483598490e4, // 929569*π^14/638512875 5.31214808666037709352112e4, // 6404582*π^16/10854718875 2.373e5, }; #if defined __CELLOS_LV2__ static inline double lol_fctid(double x) INLINEATTR; static inline double lol_fctidz(double x) INLINEATTR; static inline double lol_fcfid(double x) INLINEATTR; static inline double lol_frsqrte(double x) INLINEATTR; static inline double lol_fsel(double c, double gte, double lt) INLINEATTR; static inline double lol_fres(double x) INLINEATTR; static inline double lol_fdiv(double a, double b) INLINEATTR; #endif static inline double lol_fabs(double x) INLINEATTR; #if defined __GNUC__ static inline double lol_round(double x) INLINEATTR; static inline double lol_trunc(double x) INLINEATTR; #endif #if defined __CELLOS_LV2__ static inline double lol_fctid(double x) { double r; #if defined __SNC__ r = __builtin_fctid(x); #else __asm__ ("fctid %0, %1" : "=f" (r) : "f" (x)); #endif return r; } static double lol_fctidz(double x) { double r; #if defined __SNC__ r = __builtin_fctidz(x); #else __asm__ ("fctidz %0, %1" : "=f" (r) : "f" (x)); #endif return r; } static double lol_fcfid(double x) { double r; #if defined __SNC__ r = __builtin_fcfid(x); #else __asm__ ("fcfid %0, %1" : "=f" (r) : "f" (x)); #endif return r; } static double lol_frsqrte(double x) { #if defined __SNC__ return __builtin_frsqrte(x); #else double r; __asm__ ("frsqrte %0, %1" : "=f" (r) : "f" (x)); return r; #endif } static inline double lol_fsel(double c, double gte, double lt) { #if defined __CELLOS_LV2__ && defined __SNC__ return __fsel(c, gte, lt); #elif defined __CELLOS_LV2__ double r; __asm__ ("fsel %0, %1, %2, %3" : "=f" (r) : "f" (c), "f" (gte), "f" (lt)); return r; #else return (c >= 0) ? gte : lt; #endif } static inline double lol_fres(double x) { double ret; #if defined __SNC__ ret = __builtin_fre(x); #else __asm__ ("fres %0, %1" : "=f" (ret) : "f" (x)); #endif return ret; } static inline double lol_fdiv(double a, double b) { /* Estimate */ double x0 = lol_fres(b); /* Two steps of Newton-Raphson */ x0 = (b * x0 - ONE) * -x0 + x0; x0 = (b * x0 - ONE) * -x0 + x0; return a * x0; } #endif /* __CELLOS_LV2__ */ static inline double lol_fabs(double x) { #if defined __CELLOS_LV2__ && defined __SNC__ return __fabs(x); #elif defined __CELLOS_LV2__ double r; __asm__ ("fabs %0, %1" : "=f" (r) : "f" (x)); return r; #elif defined __GNUC__ return __builtin_fabs(x); #else using std::fabs; return fabs(x); #endif } #if defined __GNUC__ static inline double lol_round(double x) { #if defined __CELLOS_LV2__ return lol_fcfid(lol_fctid(x)); #else return __builtin_round(x); #endif } static inline double lol_trunc(double x) { #if defined __CELLOS_LV2__ return lol_fcfid(lol_fctidz(x)); #else return __builtin_trunc(x); #endif } #endif double lol_sin(double x) { double absx = lol_fabs(x * INV_PI); /* If branches are cheap, skip the cycle count when |x| < π/4, * and only do the Taylor series up to the required precision. */ #if defined LOL_FEATURE_CHEAP_BRANCHES if (absx < QUARTER) { /* Computing x^4 is one multiplication too many we do, but it helps * interleave the Taylor series operations a lot better. */ double x2 = absx * absx; double x4 = x2 * x2; double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE; double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; double taylor = sub2 * x2 + sub1; return x * taylor; } #endif /* Wrap |x| to the range [-1, 1] and keep track of the number of * cycles required. If odd, we'll need to change the sign of the * result. */ #if defined __CELLOS_LV2__ double sign = lol_fsel(x, D_PI, -D_PI); double num_cycles = lol_round(absx); double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); sign = lol_fsel(is_even, sign, -sign); #else double num_cycles = absx + TWO_EXP_52; FP_USE(num_cycles); num_cycles -= TWO_EXP_52; double is_even = TWO * num_cycles - ONE; FP_USE(is_even); is_even += TWO_EXP_54; FP_USE(is_even); is_even -= TWO_EXP_54; FP_USE(is_even); is_even -= TWO * num_cycles - ONE; double sign = is_even; #endif absx -= num_cycles; /* If branches are very cheap, we have the option to do the Taylor * series at a much lower degree by splitting. */ #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES if (lol_fabs(absx) > QUARTER) { sign = (x * absx >= 0.0) ? sign : -sign; double x1 = HALF - lol_fabs(absx); double x2 = x1 * x1; double x4 = x2 * x2; double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; double taylor = sub2 * x2 + sub1; return taylor * sign; } #endif #if !defined __CELLOS_LV2__ sign *= (x >= 0.0) ? D_PI : -D_PI; #endif /* Compute a Tailor series for sin() and combine sign information. */ double x2 = absx * absx; double x4 = x2 * x2; #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE; double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; #else double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE; double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0]; #endif double taylor = sub2 * x2 + sub1; return absx * taylor * sign; } double lol_cos(double x) { double absx = lol_fabs(x * INV_PI); #if defined LOL_FEATURE_CHEAP_BRANCHES if (absx < QUARTER) { double x2 = absx * absx; double x4 = x2 * x2; double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; double taylor = (sub1 * x2 + sub2) * x2 + ONE; return taylor; } #endif #if defined __CELLOS_LV2__ double num_cycles = lol_round(absx); double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); double sign = lol_fsel(is_even, ONE, NEG_ONE); #else double num_cycles = absx + TWO_EXP_52; FP_USE(num_cycles); num_cycles -= TWO_EXP_52; double is_even = TWO * num_cycles - ONE; FP_USE(is_even); is_even += TWO_EXP_54; FP_USE(is_even); is_even -= TWO_EXP_54; FP_USE(is_even); is_even -= TWO * num_cycles - ONE; double sign = is_even; #endif absx -= num_cycles; #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES if (lol_fabs(absx) > QUARTER) { double x1 = HALF - lol_fabs(absx); double x2 = x1 * x1; double x4 = x2 * x2; double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE; double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; double taylor = sub2 * x2 + sub1; return x1 * taylor * sign * D_PI; } #endif double x2 = absx * absx; double x4 = x2 * x2; #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; #else double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0]; #endif double taylor = sub2 * x2 + sub1; return taylor * sign; } void lol_sincos(double x, double *sinx, double *cosx) { double absx = lol_fabs(x * INV_PI); #if defined LOL_FEATURE_CHEAP_BRANCHES if (absx < QUARTER) { double x2 = absx * absx; double x4 = x2 * x2; /* Computing the Taylor series to the 11th order is enough to get * x * 1e-11 precision, but we push it to the 13th order so that * tan() has a better precision. */ double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE; double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; double taylors = subs2 * x2 + subs1; *sinx = x * taylors; double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; double taylorc = (subc1 * x2 + subc2) * x2 + ONE; *cosx = taylorc; return; } #endif #if defined __CELLOS_LV2__ double num_cycles = lol_round(absx); double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); double sin_sign = lol_fsel(x, D_PI, -D_PI); sin_sign = lol_fsel(is_even, sin_sign, -sin_sign); double cos_sign = lol_fsel(is_even, ONE, NEG_ONE); #else double num_cycles = absx + TWO_EXP_52; FP_USE(num_cycles); num_cycles -= TWO_EXP_52; double is_even = TWO * num_cycles - ONE; FP_USE(is_even); is_even += TWO_EXP_54; FP_USE(is_even); is_even -= TWO_EXP_54; FP_USE(is_even); is_even -= TWO * num_cycles - ONE; double sin_sign = is_even; double cos_sign = is_even; #endif absx -= num_cycles; #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES if (lol_fabs(absx) > QUARTER) { cos_sign = sin_sign; sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign; double x1 = HALF - lol_fabs(absx); double x2 = x1 * x1; double x4 = x2 * x2; double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; double taylors = subs2 * x2 + subs1; *sinx = taylors * sin_sign; double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE; double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; double taylorc = subc2 * x2 + subc1; *cosx = x1 * taylorc * cos_sign * D_PI; return; } #endif #if !defined __CELLOS_LV2__ sin_sign *= (x >= 0.0) ? D_PI : -D_PI; #endif double x2 = absx * absx; double x4 = x2 * x2; #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE; double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; #else double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE; double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0]; /* Push Taylor series to the 19th order to enhance tan() accuracy. */ double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE; double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0]; #endif double taylors = subs2 * x2 + subs1; *sinx = absx * taylors * sin_sign; double taylorc = subc2 * x2 + subc1; *cosx = taylorc * cos_sign; } void lol_sincos(float x, float *sinx, float *cosx) { double x2 = static_cast<double>(x); double s2, c2; lol_sincos(x2, &s2, &c2); *sinx = static_cast<float>(s2); *cosx = static_cast<float>(c2); } double lol_tan(double x) { #if defined LOL_FEATURE_CHEAP_BRANCHES double absx = lol_fabs(x * INV_PI); /* This value was determined empirically to ensure an error of no * more than x * 1e-11 in this range. */ if (absx < 0.163) { double x2 = absx * absx; double x4 = x2 * x2; double sub1 = (((TC[7] * x4 + TC[5]) * x4 + TC[3]) * x4 + TC[1]) * x4 + ONE; double sub2 = (((TC[8] * x4 + TC[6]) * x4 + TC[4]) * x4 + TC[2]) * x4 + TC[0]; double taylor = sub2 * x2 + sub1; return x * taylor; } #endif double sinx, cosx; lol_sincos(x, &sinx, &cosx); /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */ double absc = lol_fabs(cosx); #if defined __CELLOS_LV2__ double is_cos_not_zero = absc - VERY_SMALL_NUMBER; cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER); return lol_fdiv(sinx, cosx); #else if (__unlikely(absc < VERY_SMALL_NUMBER)) cosx = VERY_SMALL_NUMBER; return sinx / cosx; #endif } } /* namespace lol */