core: implement accelerated cos().

13 lat temu · 49f9c59ff3
--- a/src/core.h
+++ b/src/core.h
@@ -17,6 +17,9 @@
 #define __LOL_CORE_H__

 // CPU features
 #undef LOL_FEATURE_CHEAP_BRANCHES
 #undef LOL_FEATURE_VERY_CHEAP_BRANCHES

 #if !defined __CELLOS_LV2__
 #   define LOL_FEATURE_CHEAP_BRANCHES
 #endif
--- a/src/trig.cpp
+++ b/src/trig.cpp
@@ -206,7 +206,6 @@ double lol_sin(double x)
    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
    sign = lol_fsel(is_even, sign, -sign);
 #else
    double sign = (x >= 0.0) ? PI : NEG_PI;
    double num_cycles = absx + TWO_EXP_52;
    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;

@@ -215,17 +214,19 @@ double lol_sin(double x)
    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
    __asm__("" : "+m" (is_even));
    is_even -= TWO * num_cycles - ONE;
    sign *= is_even;
    double sign = is_even;
 #endif
    absx -= num_cycles;

    /* If branches are very cheap, we have the option to do the Taylor
     * series at a much lower degree by splitting. */
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
    if (lol_fabs(absx) > QUARTER)
    {
        sign = (x * absx >= 0.0) ? is_even : -is_even;
        sign = (x * absx >= 0.0) ? sign : -sign;

        double k = HALF - lol_fabs(absx);
        double x2 = k * k;
        double x1 = HALF - lol_fabs(absx);
        double x2 = x1 * x1;
        double x4 = x2 * x2;
        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
@@ -235,6 +236,8 @@ double lol_sin(double x)
    }
 #endif

    sign *= (x >= 0.0) ? PI : NEG_PI;

    double x2 = absx * absx;
    double x4 = x2 * x2;
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
@@ -249,5 +252,66 @@ double lol_sin(double x)
    return absx * taylor * sign;
 }

 double lol_cos(double x)
 {
    double absx = lol_fabs(x * INV_PI);

 #if defined LOL_FEATURE_CHEAP_BRANCHES
    if (absx < QUARTER)
    {
        double x2 = absx * absx;
        double x4 = x2 * x2;
        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
        return taylor;
    }
 #endif

 #if defined __CELLOS_LV2__
    double num_cycles = lol_round(absx);
    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
    double sign = lol_fsel(is_even, ONE, NEG_ONE);
 #else
    double num_cycles = absx + TWO_EXP_52;
    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;

    double is_even = TWO * num_cycles - ONE;
    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
    __asm__("" : "+m" (is_even));
    is_even -= TWO * num_cycles - ONE;
    double sign = is_even;
 #endif
    absx -= num_cycles;

 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
    if (lol_fabs(absx) > QUARTER)
    {
        double x1 = HALF - lol_fabs(absx);
        double x2 = x1 * x1;
        double x4 = x2 * x2;
        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
        double taylor = sub2 * x2 + sub1;

        return x1 * taylor * sign * PI;
    }
 #endif

    double x2 = absx * absx;
    double x4 = x2 * x2;
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
    double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
 #else
    double sub1 = ((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1];
    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
 #endif
    double taylor = (sub1 * x2 + sub2) * x2 + ONE;

    return taylor * sign;
 }

 } /* namespace lol */

--- a/test/lol-bench.cpp
+++ b/test/lol-bench.cpp
@@ -74,7 +74,7 @@ int main(int argc, char **argv)

 static void bench_trig(int mode)
 {
    float result[5] = { 0.0f };
    float result[7] = { 0.0f };
    Timer timer;

    /* Set up tables */
@@ -127,11 +127,27 @@ static void bench_trig(int mode)
            pf2[i] = __builtin_cosf(pf[i]);
        result[3] += timer.GetMs();

        /* Fast cos */
        timer.GetMs();
        for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
 #if defined HAVE_FASTMATH_H
            pf2[i] = f_cosf(pf[i]);
 #else
            pf2[i] = cosf(pf[i]);
 #endif
        result[4] += timer.GetMs();

        /* Lol cos */
        timer.GetMs();
        for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
            pf2[i] = lol_cos(pf[i]);
        result[5] += timer.GetMs();

        /* Tan */
        timer.GetMs();
        for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
            pf2[i] = __builtin_tanf(pf[i]);
        result[4] += timer.GetMs();
        result[6] += timer.GetMs();
    }

    delete[] pf;
@@ -145,7 +161,9 @@ static void bench_trig(int mode)
    Log::Info("float = fastsinf(float)  %7.3f\n", result[1]);
    Log::Info("float = lol_sinf(float)  %7.3f\n", result[2]);
    Log::Info("float = cosf(float)      %7.3f\n", result[3]);
    Log::Info("float = tanf(float)      %7.3f\n", result[4]);
    Log::Info("float = fastcosf(float)  %7.3f\n", result[4]);
    Log::Info("float = lol_cosf(float)  %7.3f\n", result[5]);
    Log::Info("float = tanf(float)      %7.3f\n", result[6]);
 }

 static void bench_matrix(int mode)
--- a/test/trig.cpp
+++ b/test/trig.cpp
@@ -54,6 +54,22 @@ public:
            double b = lol_sin(f);
            CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
        }

        for (int i = -10000; i < 10000; i++)
        {
            double f = (double)i * (1.0 / 1000.0);
            double a = __builtin_cos(f);
            double b = lol_cos(f);
            CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
        }

        for (int i = -10000; i < 10000; i++)
        {
            double f = (double)i * (1.0 / 100000.0);
            double a = __builtin_cos(f);
            double b = lol_cos(f);
            CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
        }
    }
 };