core: add a code shortcut for sin() on platforms that have cheap branches.

13 vuotta sitten · a65c472ffd
--- a/src/core.h
+++ b/src/core.h
@@ -16,6 +16,11 @@
 #if !defined __LOL_CORE_H__
 #define __LOL_CORE_H__

 // CPU features
 #if !defined __CELLOS_LV2__
 #   define LOL_FEATURE_CHEAP_BRANCHES
 #endif

 // Base types
 #include "trig.h"
 #include "half.h"
--- a/src/trig.cpp
+++ b/src/trig.cpp
@@ -23,24 +23,25 @@ using namespace std;
 namespace lol
 {

 static const double PI     = 3.14159265358979323846;
 static const double NEG_PI = -3.14159265358979323846;
 static const double PI_2   = PI / 2.0;
 static const double PI_4   = PI / 4.0;
 static const double INV_PI = 1.0 / PI;
 static const double ROOT3  = 1.732050807568877293527;
 static const double PI     = 3.14159265358979323846264;
 static const double NEG_PI = -3.14159265358979323846264;
 static const double PI_2   = 1.57079632679489661923132;
 static const double PI_4   = 0.785398163397448309615661;
 static const double INV_PI = 0.318309886183790671537768;
 static const double ROOT3  = 1.73205080756887729352745;

 static const double ZERO    = 0.0;
 static const double ONE     = 1.0;
 static const double NEG_ONE = -1.0;
 static const double HALF    = 0.5;
 static const double QUARTER = 0.25;
 static const double TWO     = 2.0;
 static const double VERY_SMALL_NUMBER = 0x1.0p-128;
 static const double TWO_EXP_52 = 4503599627370496.0;
 static const double TWO_EXP_54 = 18014398509481984.0;

 /** sin Taylor series coefficients. */
 static const double SC[]    =
 static const double SC[] =
 {
    -1.6449340668482264364724e-0, // pi^2/3!
    +8.1174242528335364363700e-1, // pi^4/5!
@@ -52,6 +53,18 @@ static const double SC[]    =
    +2.5312174041370276513517e-7, // pi^16/17!
 };

 static const double CC[] =
 {
    -4.9348022005446793094172e-0, // pi^2/2!
    +4.0587121264167682181850e-0, // pi^4/4!
    -1.3352627688545894958753e-0, // pi^6/6!
    +2.3533063035889320454188e-1, // pi^8/8!
    -2.5806891390014060012598e-2, // pi^10/10!
    +1.9295743094039230479033e-3, // pi^12/12!
    -1.0463810492484570711802e-4, // pi^14/14!
    +4.3030695870329470072978e-6, // pi^16/16!
 };

 /* Custom intrinsics */
 #define INLINEATTR __attribute__((always_inline))

@@ -167,16 +180,33 @@ static inline double lol_trunc(double x)
 double lol_sin(double x)
 {
    double absx = lol_fabs(x * INV_PI);
    double sign = lol_fsel(x, PI, NEG_PI);

    /* To compute sin(x) we build a Taylor series for |x|/pi wrapped to
     * the range [-1, 1]. We also switch the result sign if the number
     * of cycles is odd. */
    /* If branches are cheap, skip the cycle count when |x| < π/4,
     * and only do the Taylor series up to the required precision. */
 #if defined LOL_FEATURE_CHEAP_BRANCHES
    if (absx < QUARTER)
    {
        /* Computing x^4 is one multiplication too many we do, but it helps
         * interleave the Taylor series operations a lot better. */
        double x2 = absx * absx;
        double x4 = x2 * x2;
        double sub1 = SC[3] * x4 + SC[1];
        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
        return x * taylor;
    }
 #endif

    /* Wrap |x| to the range [-1, 1] and keep track of the number of
     * cycles required. If odd, we'll need to change the sign of the
     * result. */
 #if defined __CELLOS_LV2__
    double sign = lol_fsel(x, PI, NEG_PI);
    double num_cycles = lol_round(absx);
    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
    sign = lol_fsel(is_even, sign, -sign);
 #else
    double sign = (x >= 0.0) ? PI : NEG_PI;
    double num_cycles = absx + TWO_EXP_52;
    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;

@@ -187,18 +217,36 @@ double lol_sin(double x)
    is_even -= TWO * num_cycles - ONE;
    sign *= is_even;
 #endif
    double norm_x = absx - num_cycles;
    absx -= num_cycles;

 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
    if (lol_fabs(absx) > QUARTER)
    {
        sign = (x * absx >= 0.0) ? is_even : -is_even;

        double k = HALF - lol_fabs(absx);
        double x2 = k * k;
        double x4 = x2 * x2;
        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
        double taylor = (sub1 * x2 + sub2) * x2 + ONE;

        return taylor * sign;
    }
 #endif

    /* Computing x^4 is one multiplication too many we do, but it helps
     * interleave the Taylor series operations a lot better. */
    double x2 = norm_x * norm_x;
    double x2 = absx * absx;
    double x4 = x2 * x2;
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
    double sub1 = SC[3] * x4 + SC[1];
    double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
 #else
    double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1];
    double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
 #endif
    double taylor = (sub1 * x2 + sub2) * x2 + ONE;

    double result = norm_x * taylor;
    return result * sign;
    return absx * taylor * sign;
 }

 } /* namespace lol */