浏览代码

core: minor refactoring in the float / half conversions to accommodate

for future array versions.
legacy
Sam Hocevar sam 14 年前
父节点
当前提交
2d85e76ec8
共有 4 个文件被更改,包括 133 次插入81 次删除
  1. +104
    -47
      src/half.cpp
  2. +9
    -14
      src/half.h
  3. +18
    -18
      test/half.cpp
  4. +2
    -2
      test/lol-bench.cpp

+ 104
- 47
src/half.cpp 查看文件

@@ -19,16 +19,22 @@ using namespace std;
namespace lol namespace lol
{ {


/* These macros implement a finite iterator useful to build lookup
* tables. For instance, S64(0) will call S1(x) for all values of x
* between 0 and 63.
* Due to the exponential behaviour of the calls, the stress on the
* compiler may be significant. */
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)

/* Lookup table-based algorithm from “Fast Half Float Conversions” /* Lookup table-based algorithm from “Fast Half Float Conversions”
* by Jeroen van der Zijp, November 2008. No rounding is performed, * by Jeroen van der Zijp, November 2008. No rounding is performed,
* and some NaN values may be incorrectly converted to Inf. */ * and some NaN values may be incorrectly converted to Inf. */
half half::makefast(float f)
static inline uint16_t float_to_half_nobranch(uint32_t x)
{ {
#define S4(x) S1(4*(x)), S1(4*(x)+1), S1(4*(x)+2), S1(4*(x)+3)
#define S16(x) S4(4*(x)), S4(4*(x)+1), S4(4*(x)+2), S4(4*(x)+3)
#define S64(x) S16(4*(x)), S16(4*(x)+1), S16(4*(x)+2), S16(4*(x)+3)
#define S256(x) S64(4*(x)), S64(4*(x)+1), S64(4*(x)+2), S64(4*(x)+3)

static uint16_t const basetable[512] = static uint16_t const basetable[512] =
{ {
#define S1(i) (((i) < 103) ? 0x0000: \ #define S1(i) (((i) < 103) ? 0x0000: \
@@ -52,28 +58,24 @@ half half::makefast(float f)
#undef S1 #undef S1
}; };


union { float f; uint32_t x; } u = { f };

uint16_t bits = basetable[(u.x >> 23) & 0x1ff];
bits |= (u.x & 0x007fffff) >> shifttable[(u.x >> 23) & 0x1ff];
return makebits(bits);
uint16_t bits = basetable[(x >> 23) & 0x1ff];
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
return bits;
} }


/* This method is faster than the OpenEXR implementation (very often /* This method is faster than the OpenEXR implementation (very often
* used, eg. in Ogre), with the additional benefit of rounding, inspired * used, eg. in Ogre), with the additional benefit of rounding, inspired
* by James Tursa’s half-precision code. */ * by James Tursa’s half-precision code. */
half half::makeslow(float f)
static inline uint16_t float_to_half_branch(uint32_t x)
{ {
union { float f; uint32_t x; } u = { f };

uint16_t bits = (u.x >> 16) & 0x8000; /* Get the sign */
uint16_t m = (u.x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
unsigned int e = (u.x >> 23) & 0xff; /* Using int is faster here */
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */


/* If zero, or denormal, or exponent underflows too much for a denormal, /* If zero, or denormal, or exponent underflows too much for a denormal,
* return signed zero. */ * return signed zero. */
if (e < 103) if (e < 103)
return makebits(bits);
return bits;


/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
if (e > 142) if (e > 142)
@@ -81,8 +83,8 @@ half half::makeslow(float f)
bits |= 0x7c00u; bits |= 0x7c00u;
/* If exponent was 0xff and one mantissa bit was set, it means NaN, /* If exponent was 0xff and one mantissa bit was set, it means NaN,
* not Inf, so make sure we set one mantissa bit too. */ * not Inf, so make sure we set one mantissa bit too. */
bits |= e == 255 && (u.x & 0x007fffffu);
return makebits(bits);
bits |= e == 255 && (x & 0x007fffffu);
return bits;
} }


/* If exponent underflows but not too much, return a denormal */ /* If exponent underflows but not too much, return a denormal */
@@ -92,51 +94,92 @@ half half::makeslow(float f)
/* Extra rounding may overflow and set mantissa to 0 and exponent /* Extra rounding may overflow and set mantissa to 0 and exponent
* to 1, which is OK. */ * to 1, which is OK. */
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
return makebits(bits);
return bits;
} }


bits |= ((e - 112) << 10) | (m >> 1); bits |= ((e - 112) << 10) | (m >> 1);
/* Extra rounding. An overflow will set mantissa to 0 and increment /* Extra rounding. An overflow will set mantissa to 0 and increment
* the exponent, which is OK. */ * the exponent, which is OK. */
bits += m & 1; bits += m & 1;
return makebits(bits);
return bits;
} }


half::operator float() const
static int const shifttable[32] =
{ {
union { float f; uint32_t x; } u;
23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
};
static uint32_t const shiftmagic = 0x07c4acddu;


uint32_t s = (m_bits & 0x8000u) << 16;
/* Lookup table-based algorithm from “Fast Half Float Conversions”
* by Jeroen van der Zijp, November 2008. Tables are generated using
* the C++ preprocessor, thanks to a branchless implementation also
* used in half_to_float_branch(). This code is actually almost always
* slower than the branching one. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define MFF(i) (MF(i) | (MF(i) >> 8))
#define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]


if ((m_bits & 0x7fffu) == 0)
static uint32_t const mantissatable[2048] =
{ {
u.x = (uint32_t)m_bits << 16;
return u.f;
}
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
S1024(0),
#undef S1
};


uint32_t e = m_bits & 0x7c00u;
uint32_t m = m_bits & 0x03ffu;
static uint32_t const exponenttable[64] =
{
#define S1(i) (((i) == 0) ? 0 : \
((i) < 31) ? ((i) << 23) : \
((i) == 31) ? 0x47800000u : \
((i) == 32) ? 0x80000000u : \
((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
S64(0),
#undef S1
};


if (e == 0)
static int const offsettable[64] =
{ {
static int const shifttable[32] =
{
10, 1, 9, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 7, 0,
2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 5, 6, 0,
};
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
S64(0),
#undef S1
};


return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
+ exponenttable[x >> 10];
}

/* This algorithm is similar to the OpenEXR implementation, except it
* uses branchless code in the denormal path. */
static inline uint32_t half_to_float_branch(uint16_t x)
{
uint32_t s = (x & 0x8000u) << 16;

if ((x & 0x7fffu) == 0)
return (uint32_t)x << 16;

uint32_t e = x & 0x7c00u;
uint32_t m = x & 0x03ffu;

if (e == 0)
{
uint32_t v = m | (m >> 1); uint32_t v = m | (m >> 1);
v |= v >> 2; v |= v >> 2;
v |= v >> 4; v |= v >> 4;
v |= v >> 8; v |= v >> 8;


e = shifttable[(v * 0x07C4ACDDU) >> 27];
m <<= e;
e = shifttable[(v * shiftmagic) >> 27];


/* We don't have to remove the 10th mantissa bit because it gets /* We don't have to remove the 10th mantissa bit because it gets
* added to our underestimated exponent. */ * added to our underestimated exponent. */
u.x = s | (((112 - e) << 23) + (m << 13));
return u.f;
return s | (((125 - e) << 23) + (m << e));
} }


if (e == 0x7c00u) if (e == 0x7c00u)
@@ -144,15 +187,29 @@ half::operator float() const
/* The amd64 pipeline likes the if() better than a ternary operator /* The amd64 pipeline likes the if() better than a ternary operator
* or any other trick I could find. --sam */ * or any other trick I could find. --sam */
if (m == 0) if (m == 0)
u.x = s | 0x7f800000u;
else
u.x = s | 0x7fc00000u;

return u.f;
return s | 0x7f800000u;
return s | 0x7fc00000u;
} }


u.x = s | (((e >> 10) + 112) << 23) | (m << 13);
return s | (((e >> 10) + 112) << 23) | (m << 13);
}

half half::makefast(float f)
{
union { float f; uint32_t x; } u = { f };
return makebits(float_to_half_nobranch(u.x));
}

half half::makeslow(float f)
{
union { float f; uint32_t x; } u = { f };
return makebits(float_to_half_branch(u.x));
}


half::operator float() const
{
union { float f; uint32_t x; } u;
u.x = half_to_float_branch(bits);
return u.f; return u.f;
} }




+ 9
- 14
src/half.h 查看文件

@@ -23,9 +23,6 @@ namespace lol


class half class half
{ {
private:
uint16_t m_bits;

public: public:
inline half() { } inline half() { }


@@ -36,27 +33,22 @@ public:


inline int isnan() const inline int isnan() const
{ {
return ((m_bits & 0x7c00u) == 0x7c00u) && (m_bits & 0x03ffu);
return ((bits & 0x7c00u) == 0x7c00u) && (bits & 0x03ffu);
} }


inline int isfinite() const inline int isfinite() const
{ {
return (m_bits & 0x7c00u) != 0x7c00u;
return (bits & 0x7c00u) != 0x7c00u;
} }


inline int isinf() const inline int isinf() const
{ {
return (uint16_t)(m_bits << 1) == (0x7c00u << 1);
return (uint16_t)(bits << 1) == (0x7c00u << 1);
} }


inline int isnormal() const inline int isnormal() const
{ {
return (isfinite() && (m_bits & 0x7c00u)) || ((m_bits & 0x7fffu) == 0);
}

inline uint16_t bits()
{
return m_bits;
return (isfinite() && (bits & 0x7c00u)) || ((bits & 0x7fffu) == 0);
} }


/* Cast to other types */ /* Cast to other types */
@@ -64,7 +56,7 @@ public:
inline operator int() const { return (int)(float)*this; } inline operator int() const { return (int)(float)*this; }


/* Operations */ /* Operations */
inline half operator -() { return makebits(m_bits ^ 0x8000u); }
inline half operator -() { return makebits(bits ^ 0x8000u); }
inline half &operator +=(float f) { return (*this = (half)(*this + f)); } inline half &operator +=(float f) { return (*this = (half)(*this + f)); }
inline half &operator -=(float f) { return (*this = (half)(*this - f)); } inline half &operator -=(float f) { return (*this = (half)(*this - f)); }
inline half &operator *=(float f) { return (*this = (half)(*this * f)); } inline half &operator *=(float f) { return (*this = (half)(*this * f)); }
@@ -89,9 +81,12 @@ public:
static inline half makebits(uint16_t x) static inline half makebits(uint16_t x)
{ {
half ret; half ret;
ret.m_bits = x;
ret.bits = x;
return ret; return ret;
} }

/* Internal representation */
uint16_t bits;
}; };


inline float &operator +=(float &f, half h) { return f += (float)h; } inline float &operator +=(float &f, half h) { return f += (float)h; }


+ 18
- 18
test/half.cpp 查看文件

@@ -54,7 +54,7 @@ public:
{ {
half a = half::makebits(i); half a = half::makebits(i);
uint16_t b = i; uint16_t b = i;
CPPUNIT_ASSERT_EQUAL(a.bits(), b);
CPPUNIT_ASSERT_EQUAL(a.bits, b);
} }
} }


@@ -64,7 +64,7 @@ public:
{ {
half a = half::makeslow(pairs[i].f); half a = half::makeslow(pairs[i].f);
uint16_t b = pairs[i].x; uint16_t b = pairs[i].x;
CPPUNIT_ASSERT_EQUAL(a.bits(), b);
CPPUNIT_ASSERT_EQUAL(a.bits, b);
} }
} }


@@ -74,7 +74,7 @@ public:
{ {
half a = half::makefast(pairs[i].f); half a = half::makefast(pairs[i].f);
uint16_t b = pairs[i].x; uint16_t b = pairs[i].x;
CPPUNIT_ASSERT_EQUAL(a.bits(), b);
CPPUNIT_ASSERT_EQUAL(a.bits, b);
} }
} }


@@ -190,7 +190,7 @@ public:
else else
{ {
CPPUNIT_ASSERT(!isnan(f)); CPPUNIT_ASSERT(!isnan(f));
CPPUNIT_ASSERT_EQUAL(g.bits(), h.bits());
CPPUNIT_ASSERT_EQUAL(g.bits, h.bits);
} }
} }
} }
@@ -258,38 +258,38 @@ public:
half four = 4; half four = 4;


half a = one + 0.0f; half a = one + 0.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
a += 0.0f; a += 0.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
a -= 0.0f; a -= 0.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
a *= 1.0f; a *= 1.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);
a /= 1.0f; a /= 1.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), a.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, a.bits);


half b = one + 0.0f; half b = one + 0.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), b.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, b.bits);
b += 1.0f; b += 1.0f;
CPPUNIT_ASSERT_EQUAL(two.bits(), b.bits());
CPPUNIT_ASSERT_EQUAL(two.bits, b.bits);
b *= 2.0f; b *= 2.0f;
CPPUNIT_ASSERT_EQUAL(four.bits(), b.bits());
CPPUNIT_ASSERT_EQUAL(four.bits, b.bits);
b -= 2.0f; b -= 2.0f;
CPPUNIT_ASSERT_EQUAL(two.bits(), b.bits());
CPPUNIT_ASSERT_EQUAL(two.bits, b.bits);
b /= 2.0f; b /= 2.0f;
CPPUNIT_ASSERT_EQUAL(one.bits(), b.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, b.bits);


half c = 1.0f - zero; half c = 1.0f - zero;
CPPUNIT_ASSERT_EQUAL(one.bits(), c.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, c.bits);


half d = 2.0f - one; half d = 2.0f - one;
CPPUNIT_ASSERT_EQUAL(one.bits(), d.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, d.bits);


half e = 2.0f + (-one); half e = 2.0f + (-one);
CPPUNIT_ASSERT_EQUAL(one.bits(), e.bits());
CPPUNIT_ASSERT_EQUAL(one.bits, e.bits);


half f = (2.0f * two) / (1.0f + one); half f = (2.0f * two) / (1.0f + one);
CPPUNIT_ASSERT_EQUAL(two.bits(), f.bits());
CPPUNIT_ASSERT_EQUAL(two.bits, f.bits);
} }


private: private:


+ 2
- 2
test/lol-bench.cpp 查看文件

@@ -40,7 +40,7 @@ int main(int argc, char **argv)
u.x = i; u.x = i;


half h = half::makeslow(u.f); half h = half::makeslow(u.f);
total ^= h.bits();
total ^= h.bits;
} }
Log::Info("time for makeslow: %f (hash %04x)\n", timer.GetMs(), total); Log::Info("time for makeslow: %f (hash %04x)\n", timer.GetMs(), total);


@@ -50,7 +50,7 @@ int main(int argc, char **argv)
u.x = i; u.x = i;


half h = half::makefast(u.f); half h = half::makefast(u.f);
total ^= h.bits();
total ^= h.bits;
} }
Log::Info("time for makefast: %f (hash %04x)\n", timer.GetMs(), total); Log::Info("time for makefast: %f (hash %04x)\n", timer.GetMs(), total);




正在加载...
取消
保存