From a65c472ffdfcad26f67b990f564f416e3e1db588 Mon Sep 17 00:00:00 2001 From: Sam Hocevar Date: Sat, 3 Sep 2011 17:45:16 +0000 Subject: [PATCH] core: add a code shortcut for sin() on platforms that have cheap branches. --- src/core.h | 5 ++++ src/trig.cpp | 82 +++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 17 deletions(-) diff --git a/src/core.h b/src/core.h index 06387b30..ebd7de92 100644 --- a/src/core.h +++ b/src/core.h @@ -16,6 +16,11 @@ #if !defined __LOL_CORE_H__ #define __LOL_CORE_H__ +// CPU features +#if !defined __CELLOS_LV2__ +# define LOL_FEATURE_CHEAP_BRANCHES +#endif + // Base types #include "trig.h" #include "half.h" diff --git a/src/trig.cpp b/src/trig.cpp index 8884c224..8a397dc3 100644 --- a/src/trig.cpp +++ b/src/trig.cpp @@ -23,24 +23,25 @@ using namespace std; namespace lol { -static const double PI = 3.14159265358979323846; -static const double NEG_PI = -3.14159265358979323846; -static const double PI_2 = PI / 2.0; -static const double PI_4 = PI / 4.0; -static const double INV_PI = 1.0 / PI; -static const double ROOT3 = 1.732050807568877293527; +static const double PI = 3.14159265358979323846264; +static const double NEG_PI = -3.14159265358979323846264; +static const double PI_2 = 1.57079632679489661923132; +static const double PI_4 = 0.785398163397448309615661; +static const double INV_PI = 0.318309886183790671537768; +static const double ROOT3 = 1.73205080756887729352745; static const double ZERO = 0.0; static const double ONE = 1.0; static const double NEG_ONE = -1.0; static const double HALF = 0.5; +static const double QUARTER = 0.25; static const double TWO = 2.0; static const double VERY_SMALL_NUMBER = 0x1.0p-128; static const double TWO_EXP_52 = 4503599627370496.0; static const double TWO_EXP_54 = 18014398509481984.0; /** sin Taylor series coefficients. */ -static const double SC[] = +static const double SC[] = { -1.6449340668482264364724e-0, // pi^2/3! +8.1174242528335364363700e-1, // pi^4/5! @@ -52,6 +53,18 @@ static const double SC[] = +2.5312174041370276513517e-7, // pi^16/17! }; +static const double CC[] = +{ + -4.9348022005446793094172e-0, // pi^2/2! + +4.0587121264167682181850e-0, // pi^4/4! + -1.3352627688545894958753e-0, // pi^6/6! + +2.3533063035889320454188e-1, // pi^8/8! + -2.5806891390014060012598e-2, // pi^10/10! + +1.9295743094039230479033e-3, // pi^12/12! + -1.0463810492484570711802e-4, // pi^14/14! + +4.3030695870329470072978e-6, // pi^16/16! +}; + /* Custom intrinsics */ #define INLINEATTR __attribute__((always_inline)) @@ -167,16 +180,33 @@ static inline double lol_trunc(double x) double lol_sin(double x) { double absx = lol_fabs(x * INV_PI); - double sign = lol_fsel(x, PI, NEG_PI); - /* To compute sin(x) we build a Taylor series for |x|/pi wrapped to - * the range [-1, 1]. We also switch the result sign if the number - * of cycles is odd. */ + /* If branches are cheap, skip the cycle count when |x| < π/4, + * and only do the Taylor series up to the required precision. */ +#if defined LOL_FEATURE_CHEAP_BRANCHES + if (absx < QUARTER) + { + /* Computing x^4 is one multiplication too many we do, but it helps + * interleave the Taylor series operations a lot better. */ + double x2 = absx * absx; + double x4 = x2 * x2; + double sub1 = SC[3] * x4 + SC[1]; + double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; + double taylor = (sub1 * x2 + sub2) * x2 + ONE; + return x * taylor; + } +#endif + + /* Wrap |x| to the range [-1, 1] and keep track of the number of + * cycles required. If odd, we'll need to change the sign of the + * result. */ #if defined __CELLOS_LV2__ + double sign = lol_fsel(x, PI, NEG_PI); double num_cycles = lol_round(absx); double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF); sign = lol_fsel(is_even, sign, -sign); #else + double sign = (x >= 0.0) ? PI : NEG_PI; double num_cycles = absx + TWO_EXP_52; __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52; @@ -187,18 +217,36 @@ double lol_sin(double x) is_even -= TWO * num_cycles - ONE; sign *= is_even; #endif - double norm_x = absx - num_cycles; + absx -= num_cycles; + +#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES + if (lol_fabs(absx) > QUARTER) + { + sign = (x * absx >= 0.0) ? is_even : -is_even; + + double k = HALF - lol_fabs(absx); + double x2 = k * k; + double x4 = x2 * x2; + double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1]; + double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0]; + double taylor = (sub1 * x2 + sub2) * x2 + ONE; + + return taylor * sign; + } +#endif - /* Computing x^4 is one multiplication too many we do, but it helps - * interleave the Taylor series operations a lot better. */ - double x2 = norm_x * norm_x; + double x2 = absx * absx; double x4 = x2 * x2; +#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES + double sub1 = SC[3] * x4 + SC[1]; + double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0]; +#else double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]; double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0]; +#endif double taylor = (sub1 * x2 + sub2) * x2 + ONE; - double result = norm_x * taylor; - return result * sign; + return absx * taylor * sign; } } /* namespace lol */