瀏覽代碼

core: add a code shortcut for sin() on platforms that have cheap branches.

legacy
Sam Hocevar sam 13 年之前
父節點
當前提交
a65c472ffd
共有 2 個檔案被更改,包括 70 行新增17 行删除
  1. +5
    -0
      src/core.h
  2. +65
    -17
      src/trig.cpp

+ 5
- 0
src/core.h 查看文件

@@ -16,6 +16,11 @@
#if !defined __LOL_CORE_H__
#define __LOL_CORE_H__

// CPU features
#if !defined __CELLOS_LV2__
# define LOL_FEATURE_CHEAP_BRANCHES
#endif

// Base types
#include "trig.h"
#include "half.h"


+ 65
- 17
src/trig.cpp 查看文件

@@ -23,24 +23,25 @@ using namespace std;
namespace lol
{

static const double PI = 3.14159265358979323846;
static const double NEG_PI = -3.14159265358979323846;
static const double PI_2 = PI / 2.0;
static const double PI_4 = PI / 4.0;
static const double INV_PI = 1.0 / PI;
static const double ROOT3 = 1.732050807568877293527;
static const double PI = 3.14159265358979323846264;
static const double NEG_PI = -3.14159265358979323846264;
static const double PI_2 = 1.57079632679489661923132;
static const double PI_4 = 0.785398163397448309615661;
static const double INV_PI = 0.318309886183790671537768;
static const double ROOT3 = 1.73205080756887729352745;

static const double ZERO = 0.0;
static const double ONE = 1.0;
static const double NEG_ONE = -1.0;
static const double HALF = 0.5;
static const double QUARTER = 0.25;
static const double TWO = 2.0;
static const double VERY_SMALL_NUMBER = 0x1.0p-128;
static const double TWO_EXP_52 = 4503599627370496.0;
static const double TWO_EXP_54 = 18014398509481984.0;

/** sin Taylor series coefficients. */
static const double SC[] =
static const double SC[] =
{
-1.6449340668482264364724e-0, // pi^2/3!
+8.1174242528335364363700e-1, // pi^4/5!
@@ -52,6 +53,18 @@ static const double SC[] =
+2.5312174041370276513517e-7, // pi^16/17!
};

static const double CC[] =
{
-4.9348022005446793094172e-0, // pi^2/2!
+4.0587121264167682181850e-0, // pi^4/4!
-1.3352627688545894958753e-0, // pi^6/6!
+2.3533063035889320454188e-1, // pi^8/8!
-2.5806891390014060012598e-2, // pi^10/10!
+1.9295743094039230479033e-3, // pi^12/12!
-1.0463810492484570711802e-4, // pi^14/14!
+4.3030695870329470072978e-6, // pi^16/16!
};

/* Custom intrinsics */
#define INLINEATTR __attribute__((always_inline))

@@ -167,16 +180,33 @@ static inline double lol_trunc(double x)
double lol_sin(double x)
{
double absx = lol_fabs(x * INV_PI);
double sign = lol_fsel(x, PI, NEG_PI);

/* To compute sin(x) we build a Taylor series for |x|/pi wrapped to
* the range [-1, 1]. We also switch the result sign if the number
* of cycles is odd. */
/* If branches are cheap, skip the cycle count when |x| < π/4,
* and only do the Taylor series up to the required precision. */
#if defined LOL_FEATURE_CHEAP_BRANCHES
if (absx < QUARTER)
{
/* Computing x^4 is one multiplication too many we do, but it helps
* interleave the Taylor series operations a lot better. */
double x2 = absx * absx;
double x4 = x2 * x2;
double sub1 = SC[3] * x4 + SC[1];
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
double taylor = (sub1 * x2 + sub2) * x2 + ONE;
return x * taylor;
}
#endif

/* Wrap |x| to the range [-1, 1] and keep track of the number of
* cycles required. If odd, we'll need to change the sign of the
* result. */
#if defined __CELLOS_LV2__
double sign = lol_fsel(x, PI, NEG_PI);
double num_cycles = lol_round(absx);
double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
sign = lol_fsel(is_even, sign, -sign);
#else
double sign = (x >= 0.0) ? PI : NEG_PI;
double num_cycles = absx + TWO_EXP_52;
__asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;

@@ -187,18 +217,36 @@ double lol_sin(double x)
is_even -= TWO * num_cycles - ONE;
sign *= is_even;
#endif
double norm_x = absx - num_cycles;
absx -= num_cycles;

#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
if (lol_fabs(absx) > QUARTER)
{
sign = (x * absx >= 0.0) ? is_even : -is_even;

double k = HALF - lol_fabs(absx);
double x2 = k * k;
double x4 = x2 * x2;
double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
double taylor = (sub1 * x2 + sub2) * x2 + ONE;

return taylor * sign;
}
#endif

/* Computing x^4 is one multiplication too many we do, but it helps
* interleave the Taylor series operations a lot better. */
double x2 = norm_x * norm_x;
double x2 = absx * absx;
double x4 = x2 * x2;
#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
double sub1 = SC[3] * x4 + SC[1];
double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
#else
double sub1 = ((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1];
double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
#endif
double taylor = (sub1 * x2 + sub2) * x2 + ONE;

double result = norm_x * taylor;
return result * sign;
return absx * taylor * sign;
}

} /* namespace lol */


Loading…
取消
儲存