@@ -5,7 +5,6 @@ | |||||
src/lol/math/bigint.h | src/lol/math/bigint.h | ||||
src/lol/math/constants.h | src/lol/math/constants.h | ||||
src/lol/math/functions.h | src/lol/math/functions.h | ||||
src/lol/math/half.h (needs half.cpp) | |||||
src/lol/math/noise/* | src/lol/math/noise/* | ||||
@@ -1,7 +1,7 @@ | |||||
// | // | ||||
// Lol Engine | // Lol Engine | ||||
// | // | ||||
// Copyright © 2010—2019 Sam Hocevar <sam@hocevar.net> | |||||
// Copyright © 2010—2020 Sam Hocevar <sam@hocevar.net> | |||||
// | // | ||||
// Lol Engine is free software. It comes without any warranty, to | // Lol Engine is free software. It comes without any warranty, to | ||||
// the extent permitted by applicable law. You can redistribute it | // the extent permitted by applicable law. You can redistribute it | ||||
@@ -17,18 +17,19 @@ | |||||
// -------------- | // -------------- | ||||
// | // | ||||
#include <lol/base/types.h> | |||||
#include <cmath> | #include <cmath> | ||||
#include <cstdio> | |||||
#include <stdint.h> | |||||
#include <ostream> // std::ostream | |||||
#include <stdint.h> // uint32_t etc. | |||||
namespace lol | namespace lol | ||||
{ | { | ||||
/* This is OUR namespace. Don't let Windows headers mess with it. */ | |||||
#undef min | |||||
#undef max | |||||
#if _WIN32 | |||||
# pragma push_macro("near") | |||||
# pragma push_macro("far") | |||||
# undef near | |||||
# undef far | |||||
#endif | |||||
namespace half_ops { struct base {}; } | namespace half_ops { struct base {}; } | ||||
@@ -36,39 +37,85 @@ class [[nodiscard]] half | |||||
: half_ops::base | : half_ops::base | ||||
{ | { | ||||
public: | public: | ||||
/* Constructors. Always inline so that the code can work in registers | |||||
* instead of calling routines with the hidden "this" parameter. */ | |||||
inline half() { } | |||||
inline half(int f) { *this = makefast((float)f); } | |||||
inline half(float f) { *this = makefast(f); } | |||||
inline half(double f) { *this = makefast((float)f); } | |||||
inline half(ldouble f) { *this = makefast((float)f); } | |||||
// Constructors. Always inline so that the code can work in registers. | |||||
inline half() {} | |||||
explicit inline half(int f) : half(float(f)) {} | |||||
explicit inline half(double f) : half(float(f)) {} | |||||
explicit inline half(ldouble f) : half(float(f)) {} | |||||
explicit inline half(float f) | |||||
{ | |||||
union { float f; uint32_t x; } u = { f }; | |||||
// This method is faster than the OpenEXR implementation (very often | |||||
// used, eg. in Ogre), with the additional benefit of rounding, inspired | |||||
// by James Tursa’s half-precision code. | |||||
m_bits = (u.x >> 16) & 0x8000; // Get the sign | |||||
uint16_t m = (u.x >> 12) & 0x07ff; // Keep one extra bit for rounding | |||||
unsigned int e = (u.x >> 23) & 0xff; // Using int is faster here | |||||
if (e < 103) | |||||
{ | |||||
// If zero, or denormal, or exponent underflows too much for a denormal | |||||
// half, return signed zero. | |||||
} | |||||
else if (e > 142) | |||||
{ | |||||
// If NaN, return NaN. If Inf or exponent overflow, return Inf. | |||||
m_bits |= 0x7c00u; | |||||
// If exponent was 0xff and one mantissa bit was set, it means NaN, | |||||
// not Inf, so make sure we set one mantissa bit too. | |||||
m_bits |= e == 255 && (u.x & 0x007fffffu); | |||||
} | |||||
else if (e < 113) | |||||
{ | |||||
// If exponent underflows but not too much, return a denormal | |||||
m |= 0x0800u; | |||||
// Extra rounding may overflow and set mantissa to 0 and exponent | |||||
// to 1, which is OK. | |||||
m_bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); | |||||
} | |||||
else | |||||
{ | |||||
m_bits |= ((e - 112) << 10) | (m >> 1); | |||||
// Extra rounding. An overflow will set mantissa to 0 and increment | |||||
// the exponent, which is OK. | |||||
m_bits += m & 1; | |||||
} | |||||
} | |||||
[[nodiscard]] inline int is_nan() const | [[nodiscard]] inline int is_nan() const | ||||
{ | { | ||||
return ((bits & 0x7c00u) == 0x7c00u) && (bits & 0x03ffu); | |||||
return ((m_bits & 0x7c00u) == 0x7c00u) && (m_bits & 0x03ffu); | |||||
} | } | ||||
[[nodiscard]] inline int is_finite() const | [[nodiscard]] inline int is_finite() const | ||||
{ | { | ||||
return (bits & 0x7c00u) != 0x7c00u; | |||||
return (m_bits & 0x7c00u) != 0x7c00u; | |||||
} | } | ||||
[[nodiscard]] inline int is_inf() const | [[nodiscard]] inline int is_inf() const | ||||
{ | { | ||||
return (uint16_t)(bits << 1) == (0x7c00u << 1); | |||||
return (uint16_t)(m_bits << 1) == (0x7c00u << 1); | |||||
} | } | ||||
[[nodiscard]] inline int is_normal() const | [[nodiscard]] inline int is_normal() const | ||||
{ | { | ||||
return (is_finite() && (bits & 0x7c00u)) || ((bits & 0x7fffu) == 0); | |||||
return (is_finite() && (m_bits & 0x7c00u)) || ((m_bits & 0x7fffu) == 0); | |||||
} | } | ||||
/* Cast to other types -- always inline, see constructors */ | |||||
inline half &operator =(int f) { return *this = makefast((float)f); } | |||||
inline half &operator =(float f) { return *this = makefast(f); } | |||||
inline half &operator =(double f) { return *this = makefast((float)f); } | |||||
inline half &operator =(ldouble f) { return *this = makefast((float)f); } | |||||
// Convert to string | |||||
friend std::ostream &operator<<(std::ostream &stream, half const &v) | |||||
{ | |||||
return stream << float(v); | |||||
} | |||||
// Cast to other types -- always inline, see constructors | |||||
inline half &operator =(int f) { return *this = half(f); } | |||||
inline half &operator =(float f) { return *this = half(f); } | |||||
inline half &operator =(double f) { return *this = half(f); } | |||||
inline half &operator =(ldouble f) { return *this = half(f); } | |||||
[[nodiscard]] inline operator int8_t() const { return (int8_t)(float)*this; } | [[nodiscard]] inline operator int8_t() const { return (int8_t)(float)*this; } | ||||
[[nodiscard]] inline operator uint8_t() const { return (uint8_t)(float)*this; } | [[nodiscard]] inline operator uint8_t() const { return (uint8_t)(float)*this; } | ||||
[[nodiscard]] inline operator int16_t() const { return (int16_t)(float)*this; } | [[nodiscard]] inline operator int16_t() const { return (int16_t)(float)*this; } | ||||
@@ -78,15 +125,16 @@ public: | |||||
[[nodiscard]] inline operator int64_t() const { return (int64_t)(float)*this; } | [[nodiscard]] inline operator int64_t() const { return (int64_t)(float)*this; } | ||||
[[nodiscard]] inline operator uint64_t() const { return (uint64_t)(float)*this; } | [[nodiscard]] inline operator uint64_t() const { return (uint64_t)(float)*this; } | ||||
[[nodiscard]] operator float() const; | |||||
[[nodiscard]] inline operator float() const | |||||
{ | |||||
union { uint32_t x; float f; } u = { tofloatbits() }; | |||||
return u.f; | |||||
} | |||||
[[nodiscard]] inline operator double() const { return (float)(*this); } | [[nodiscard]] inline operator double() const { return (float)(*this); } | ||||
[[nodiscard]] inline operator ldouble() const { return (float)(*this); } | [[nodiscard]] inline operator ldouble() const { return (float)(*this); } | ||||
/* Array conversions */ | |||||
static void convert(half *dst, float const *src, size_t nelem); | |||||
static void convert(float *dst, half const *src, size_t nelem); | |||||
/* Operations */ | |||||
// Operators | |||||
[[nodiscard]] bool operator ==(half x) const { return (float)*this == (float)x; } | [[nodiscard]] bool operator ==(half x) const { return (float)*this == (float)x; } | ||||
[[nodiscard]] bool operator !=(half x) const { return (float)*this != (float)x; } | [[nodiscard]] bool operator !=(half x) const { return (float)*this != (float)x; } | ||||
[[nodiscard]] bool operator <(half x) const { return (float)*this < (float)x; } | [[nodiscard]] bool operator <(half x) const { return (float)*this < (float)x; } | ||||
@@ -94,10 +142,10 @@ public: | |||||
[[nodiscard]] bool operator <=(half x) const { return (float)*this <= (float)x; } | [[nodiscard]] bool operator <=(half x) const { return (float)*this <= (float)x; } | ||||
[[nodiscard]] bool operator >=(half x) const { return (float)*this >= (float)x; } | [[nodiscard]] bool operator >=(half x) const { return (float)*this >= (float)x; } | ||||
[[nodiscard]] bool operator !() const { return !(bits & 0x7fffu); } | |||||
[[nodiscard]] bool operator !() const { return !(m_bits & 0x7fffu); } | |||||
[[nodiscard]] operator bool() const { return !!*this; } | [[nodiscard]] operator bool() const { return !!*this; } | ||||
inline half operator -() const { return makebits(bits ^ 0x8000u); } | |||||
inline half operator -() const { return frombits(m_bits ^ 0x8000u); } | |||||
inline half operator +() const { return *this; } | inline half operator +() const { return *this; } | ||||
inline half &operator +=(half h) { return (*this = (half)(*this + h)); } | inline half &operator +=(half h) { return (*this = (half)(*this + h)); } | ||||
inline half &operator -=(half h) { return (*this = (half)(*this - h)); } | inline half &operator -=(half h) { return (*this = (half)(*this - h)); } | ||||
@@ -109,25 +157,78 @@ public: | |||||
[[nodiscard]] inline float operator *(half h) const { return (float)*this * (float)h; } | [[nodiscard]] inline float operator *(half h) const { return (float)*this * (float)h; } | ||||
[[nodiscard]] inline float operator /(half h) const { return (float)*this / (float)h; } | [[nodiscard]] inline float operator /(half h) const { return (float)*this / (float)h; } | ||||
/* Factories */ | |||||
static half makefast(float f); | |||||
static half makeaccurate(float f); | |||||
static inline half makebits(uint16_t x) | |||||
inline uint16_t bits() const { return m_bits; } | |||||
static inline half frombits(uint16_t x) | |||||
{ | { | ||||
half ret; | half ret; | ||||
ret.bits = x; | |||||
ret.m_bits = x; | |||||
return ret; | return ret; | ||||
} | } | ||||
/* Internal representation */ | |||||
uint16_t bits; | |||||
private: | |||||
inline uint32_t tofloatbits() const | |||||
{ | |||||
// This algorithm is similar to the OpenEXR implementation, except it | |||||
// uses branchless code in the denormal path. This is slower than the | |||||
// table version, but will be more friendly to the cache for occasional | |||||
// uses. | |||||
uint32_t s = (m_bits & 0x8000u) << 16; | |||||
if ((m_bits & 0x7fffu) == 0) | |||||
return (uint32_t)m_bits << 16; | |||||
uint32_t e = m_bits & 0x7c00u; | |||||
uint32_t m = m_bits & 0x03ffu; | |||||
if (e == 0) | |||||
{ | |||||
// We use this magic table, inspired by De Bruijn sequences, to | |||||
// compute a branchless integer log2. The actual value fetched is | |||||
// 24-log2(x+1) for x in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, | |||||
// 3fd, 3fe, 3ff. For an explanation of how 0x5a1a1a2u was obtained | |||||
// see: http://lolengine.net/blog/2012/04/03/beyond-de-bruijn | |||||
static uint32_t const shifttable[16] = | |||||
{ | |||||
23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, | |||||
}; | |||||
static uint32_t const shiftmagic = 0x5a1a1a2u; | |||||
/* m has 10 significant bits but replicating the leading bit to | |||||
* 8 positions instead of 16 works just as well because of our | |||||
* handcrafted shiftmagic table. */ | |||||
uint32_t v = m | (m >> 1); | |||||
v |= v >> 2; | |||||
v |= v >> 4; | |||||
e = shifttable[(v * shiftmagic) >> 28]; | |||||
/* We don't have to remove the 10th mantissa bit because it gets | |||||
* added to our underestimated exponent. */ | |||||
return s | (((125 - e) << 23) + (m << e)); | |||||
} | |||||
if (e == 0x7c00u) | |||||
{ | |||||
/* The amd64 pipeline likes the if() better than a ternary operator | |||||
* or any other trick I could find. --sam */ | |||||
if (m == 0) | |||||
return s | 0x7f800000u; | |||||
return s | 0x7fc00000u; | |||||
} | |||||
return s | (((e >> 10) + 112) << 23) | (m << 13); | |||||
} | |||||
// Internal representation | |||||
uint16_t m_bits; | |||||
}; | }; | ||||
static_assert(sizeof(half) == 2, "sizeof(half) == 2"); | static_assert(sizeof(half) == 2, "sizeof(half) == 2"); | ||||
/* | |||||
* Standard math and GLSL functions | |||||
*/ | |||||
// | |||||
// Standard math and GLSL functions | |||||
// | |||||
static inline half min(half a, half b) { return a < b ? a : b; } | static inline half min(half a, half b) { return a < b ? a : b; } | ||||
static inline half max(half a, half b) { return a > b ? a : b; } | static inline half max(half a, half b) { return a > b ? a : b; } | ||||
@@ -139,20 +240,20 @@ static inline float fmod(half a, half b) | |||||
static inline float fract(half a) { return fract((float)a); } | static inline float fract(half a) { return fract((float)a); } | ||||
static inline float degrees(half a) { return degrees((float)a); } | static inline float degrees(half a) { return degrees((float)a); } | ||||
static inline float radians(half a) { return radians((float)a); } | static inline float radians(half a) { return radians((float)a); } | ||||
static inline half abs(half a) { return half::makebits(a.bits & 0x7fffu); } | |||||
static inline half abs(half a) { return half::frombits(a.bits() & 0x7fffu); } | |||||
static inline half clamp(half x, half a, half b) | static inline half clamp(half x, half a, half b) | ||||
{ | { | ||||
return (x < a) ? a : (x > b) ? b : x; | return (x < a) ? a : (x > b) ? b : x; | ||||
} | } | ||||
/* | |||||
* Standard math operators | |||||
*/ | |||||
// | |||||
// Standard math operators | |||||
// | |||||
namespace half_ops | namespace half_ops | ||||
{ | { | ||||
/* Enumerate the types for which operations with half are valid */ | |||||
// Enumerate the types for which operations with half are valid | |||||
template<typename FROM, typename TO = void> struct valid {}; | template<typename FROM, typename TO = void> struct valid {}; | ||||
template<typename TO> struct valid<uint8_t, TO> | template<typename TO> struct valid<uint8_t, TO> | ||||
@@ -227,7 +328,11 @@ DECLARE_HALF_BOOL_OPS(<=) | |||||
#undef DECLARE_HALF_BOOL_OPS | #undef DECLARE_HALF_BOOL_OPS | ||||
} /* namespace half_ops */ | |||||
} // namespace half_ops | |||||
} /* namespace lol */ | |||||
} // namespace lol | |||||
#if _WIN32 | |||||
# pragma pop_macro("near") | |||||
# pragma pop_macro("far") | |||||
#endif |
@@ -29,8 +29,6 @@ | |||||
# undef far | # undef far | ||||
#endif | #endif | ||||
#define have_lol_matrix_h | |||||
namespace lol | namespace lol | ||||
{ | { | ||||
@@ -120,7 +118,6 @@ private: | |||||
}; | }; | ||||
static_assert(sizeof(imat2) == 16, "sizeof(imat2) == 16"); | static_assert(sizeof(imat2) == 16, "sizeof(imat2) == 16"); | ||||
static_assert(sizeof(f16mat2) == 8, "sizeof(f16mat2) == 8"); | |||||
static_assert(sizeof(mat2) == 16, "sizeof(mat2) == 16"); | static_assert(sizeof(mat2) == 16, "sizeof(mat2) == 16"); | ||||
static_assert(sizeof(dmat2) == 32, "sizeof(dmat2) == 32"); | static_assert(sizeof(dmat2) == 32, "sizeof(dmat2) == 32"); | ||||
@@ -211,7 +208,6 @@ private: | |||||
}; | }; | ||||
static_assert(sizeof(imat3) == 36, "sizeof(imat3) == 36"); | static_assert(sizeof(imat3) == 36, "sizeof(imat3) == 36"); | ||||
static_assert(sizeof(f16mat3) == 18, "sizeof(f16mat3) == 18"); | |||||
static_assert(sizeof(mat3) == 36, "sizeof(mat3) == 36"); | static_assert(sizeof(mat3) == 36, "sizeof(mat3) == 36"); | ||||
static_assert(sizeof(dmat3) == 72, "sizeof(dmat3) == 72"); | static_assert(sizeof(dmat3) == 72, "sizeof(dmat3) == 72"); | ||||
@@ -345,7 +341,6 @@ private: | |||||
}; | }; | ||||
static_assert(sizeof(imat4) == 64, "sizeof(imat4) == 64"); | static_assert(sizeof(imat4) == 64, "sizeof(imat4) == 64"); | ||||
static_assert(sizeof(f16mat4) == 32, "sizeof(f16mat4) == 32"); | |||||
static_assert(sizeof(mat4) == 64, "sizeof(mat4) == 64"); | static_assert(sizeof(mat4) == 64, "sizeof(mat4) == 64"); | ||||
static_assert(sizeof(dmat4) == 128, "sizeof(dmat4) == 128"); | static_assert(sizeof(dmat4) == 128, "sizeof(dmat4) == 128"); | ||||
@@ -20,8 +20,6 @@ | |||||
#include <ostream> | #include <ostream> | ||||
#include <type_traits> | #include <type_traits> | ||||
#include <lol/math/half.h> | |||||
namespace lol | namespace lol | ||||
{ | { | ||||
@@ -71,7 +71,6 @@ struct [[nodiscard]] cmplx_t : public linear_ops::base<T> | |||||
T x, y; | T x, y; | ||||
}; | }; | ||||
static_assert(sizeof(f16cmplx) == 4, "sizeof(f16cmplx) == 4"); | |||||
static_assert(sizeof(cmplx) == 8, "sizeof(cmplx) == 8"); | static_assert(sizeof(cmplx) == 8, "sizeof(cmplx) == 8"); | ||||
static_assert(sizeof(dcmplx) == 16, "sizeof(dcmplx) == 16"); | static_assert(sizeof(dcmplx) == 16, "sizeof(dcmplx) == 16"); | ||||
@@ -250,7 +249,6 @@ struct [[nodiscard]] quat_t : public linear_ops::base<T> | |||||
T w, x, y, z; | T w, x, y, z; | ||||
}; | }; | ||||
static_assert(sizeof(f16quat) == 8, "sizeof(f16quat) == 8"); | |||||
static_assert(sizeof(quat) == 16, "sizeof(quat) == 16"); | static_assert(sizeof(quat) == 16, "sizeof(quat) == 16"); | ||||
static_assert(sizeof(dquat) == 32, "sizeof(dquat) == 32"); | static_assert(sizeof(dquat) == 32, "sizeof(dquat) == 32"); | ||||
@@ -20,7 +20,7 @@ | |||||
#include <lol/math/private/ops.h> | #include <lol/math/private/ops.h> | ||||
#include <cassert> | #include <cassert> | ||||
#include <ostream> | |||||
#include <ostream> // std::ostream | |||||
#include <type_traits> | #include <type_traits> | ||||
namespace lol | namespace lol | ||||
@@ -301,7 +301,6 @@ static_assert(sizeof(i16vec2) == 4, "sizeof(i16vec2) == 4"); | |||||
static_assert(sizeof(ivec2) == 8, "sizeof(ivec2) == 8"); | static_assert(sizeof(ivec2) == 8, "sizeof(ivec2) == 8"); | ||||
static_assert(sizeof(i64vec2) == 16, "sizeof(i64vec2) == 16"); | static_assert(sizeof(i64vec2) == 16, "sizeof(i64vec2) == 16"); | ||||
static_assert(sizeof(f16vec2) == 4, "sizeof(f16vec2) == 4"); | |||||
static_assert(sizeof(vec2) == 8, "sizeof(vec2) == 8"); | static_assert(sizeof(vec2) == 8, "sizeof(vec2) == 8"); | ||||
static_assert(sizeof(dvec2) == 16, "sizeof(dvec2) == 16"); | static_assert(sizeof(dvec2) == 16, "sizeof(dvec2) == 16"); | ||||
@@ -542,7 +541,6 @@ static_assert(sizeof(i16vec3) == 6, "sizeof(i16vec3) == 6"); | |||||
static_assert(sizeof(ivec3) == 12, "sizeof(ivec3) == 12"); | static_assert(sizeof(ivec3) == 12, "sizeof(ivec3) == 12"); | ||||
static_assert(sizeof(i64vec3) == 24, "sizeof(i64vec3) == 24"); | static_assert(sizeof(i64vec3) == 24, "sizeof(i64vec3) == 24"); | ||||
static_assert(sizeof(f16vec3) == 6, "sizeof(f16vec3) == 6"); | |||||
static_assert(sizeof(vec3) == 12, "sizeof(vec3) == 12"); | static_assert(sizeof(vec3) == 12, "sizeof(vec3) == 12"); | ||||
static_assert(sizeof(dvec3) == 24, "sizeof(dvec3) == 24"); | static_assert(sizeof(dvec3) == 24, "sizeof(dvec3) == 24"); | ||||
@@ -975,7 +973,6 @@ static_assert(sizeof(i16vec4) == 8, "sizeof(i16vec4) == 8"); | |||||
static_assert(sizeof(ivec4) == 16, "sizeof(ivec4) == 16"); | static_assert(sizeof(ivec4) == 16, "sizeof(ivec4) == 16"); | ||||
static_assert(sizeof(i64vec4) == 32, "sizeof(i64vec4) == 32"); | static_assert(sizeof(i64vec4) == 32, "sizeof(i64vec4) == 32"); | ||||
static_assert(sizeof(f16vec4) == 8, "sizeof(f16vec4) == 8"); | |||||
static_assert(sizeof(vec4) == 16, "sizeof(vec4) == 16"); | static_assert(sizeof(vec4) == 16, "sizeof(vec4) == 16"); | ||||
static_assert(sizeof(dvec4) == 32, "sizeof(dvec4) == 32"); | static_assert(sizeof(dvec4) == 32, "sizeof(dvec4) == 32"); | ||||
@@ -1,248 +0,0 @@ | |||||
// | |||||
// Lol Engine | |||||
// | |||||
// Copyright © 2010—2019 Sam Hocevar <sam@hocevar.net> | |||||
// | |||||
// Lol Engine is free software. It comes without any warranty, to | |||||
// the extent permitted by applicable law. You can redistribute it | |||||
// and/or modify it under the terms of the Do What the Fuck You Want | |||||
// to Public License, Version 2, as published by the WTFPL Task Force. | |||||
// See http://www.wtfpl.net/ for more details. | |||||
// | |||||
#include <lol/engine-internal.h> | |||||
namespace lol | |||||
{ | |||||
/* These macros implement a finite iterator useful to build lookup | |||||
* tables. For instance, S64(0) will call S1(x) for all values of x | |||||
* between 0 and 63. | |||||
* Due to the exponential behaviour of the calls, the stress on the | |||||
* compiler may be important. */ | |||||
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3) | |||||
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12) | |||||
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48) | |||||
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192) | |||||
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768) | |||||
/* Lookup table-based algorithm from “Fast Half Float Conversions” | |||||
* by Jeroen van der Zijp, November 2008. No rounding is performed, | |||||
* and some NaN values may be incorrectly converted to Inf (because | |||||
* the lowest order bits in the mantissa are ignored). */ | |||||
static inline uint16_t float_to_half_nobranch(uint32_t x) | |||||
{ | |||||
static uint16_t const basetable[512] = | |||||
{ | |||||
#define S1(i) (((i) < 103) ? 0x0000u : \ | |||||
((i) < 113) ? 0x0400u >> (0x1f & (113 - (i))) : \ | |||||
((i) < 143) ? ((i) - 112) << 10 : 0x7c00u) | |||||
S256(0), | |||||
#undef S1 | |||||
#define S1(i) (uint16_t)(0x8000u | basetable[i]) | |||||
S256(0), | |||||
#undef S1 | |||||
}; | |||||
static uint8_t const shifttable[512] = | |||||
{ | |||||
#define S1(i) (((i) < 103) ? 24 : \ | |||||
((i) < 113) ? 126 - (i) : \ | |||||
((i) < 143 || (i) == 255) ? 13 : 24) | |||||
S256(0), S256(0), | |||||
#undef S1 | |||||
}; | |||||
uint16_t bits = basetable[(x >> 23) & 0x1ff]; | |||||
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; | |||||
return bits; | |||||
} | |||||
/* This method is faster than the OpenEXR implementation (very often | |||||
* used, eg. in Ogre), with the additional benefit of rounding, inspired | |||||
* by James Tursa’s half-precision code. */ | |||||
static inline uint16_t float_to_half_branch(uint32_t x) | |||||
{ | |||||
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */ | |||||
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ | |||||
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ | |||||
/* If zero, or denormal, or exponent underflows too much for a denormal | |||||
* half, return signed zero. */ | |||||
if (e < 103) | |||||
return bits; | |||||
/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ | |||||
if (e > 142) | |||||
{ | |||||
bits |= 0x7c00u; | |||||
/* If exponent was 0xff and one mantissa bit was set, it means NaN, | |||||
* not Inf, so make sure we set one mantissa bit too. */ | |||||
bits |= e == 255 && (x & 0x007fffffu); | |||||
return bits; | |||||
} | |||||
/* If exponent underflows but not too much, return a denormal */ | |||||
if (e < 113) | |||||
{ | |||||
m |= 0x0800u; | |||||
/* Extra rounding may overflow and set mantissa to 0 and exponent | |||||
* to 1, which is OK. */ | |||||
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); | |||||
return bits; | |||||
} | |||||
bits |= ((e - 112) << 10) | (m >> 1); | |||||
/* Extra rounding. An overflow will set mantissa to 0 and increment | |||||
* the exponent, which is OK. */ | |||||
bits += m & 1; | |||||
return bits; | |||||
} | |||||
/* We use this magic table, inspired by De Bruijn sequences, to compute a | |||||
* branchless integer log2. The actual value fetched is 24-log2(x+1) for x | |||||
* in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff. See | |||||
* http://lolengine.net/blog/2012/04/03/beyond-de-bruijn for an explanation | |||||
* of how the value 0x5a1a1a2u was obtained. */ | |||||
static uint32_t const shifttable[16] = | |||||
{ | |||||
23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, | |||||
}; | |||||
static uint32_t const shiftmagic = 0x5a1a1a2u; | |||||
/* Lookup table-based algorithm from “Fast Half Float Conversions” | |||||
* by Jeroen van der Zijp, November 2008. Tables are generated using | |||||
* the C++ preprocessor, thanks to a branchless implementation also | |||||
* used in half_to_float_branch(). This code is very fast when performing | |||||
* conversions on arrays of values. */ | |||||
static inline uint32_t half_to_float_nobranch(uint16_t x) | |||||
{ | |||||
#define M3(i) ((i) | ((i) >> 1)) | |||||
#define M7(i) (M3(i) | (M3(i) >> 2)) | |||||
#define MF(i) (M7(i) | (M7(i) >> 4)) | |||||
#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28] | |||||
static uint32_t const mantissatable[2048] = | |||||
{ | |||||
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i))) | |||||
S1024(0), | |||||
#undef S1 | |||||
#define S1(i) (0x38000000u + ((i) << 13)) | |||||
S1024(0), | |||||
#undef S1 | |||||
}; | |||||
static uint32_t const exponenttable[64] = | |||||
{ | |||||
#define S1(i) (((i) == 0) ? 0 : \ | |||||
((i) < 31) ? ((uint32_t)(i) << 23) : \ | |||||
((i) == 31) ? 0x47800000u : \ | |||||
((i) == 32) ? 0x80000000u : \ | |||||
((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000) | |||||
S64(0), | |||||
#undef S1 | |||||
}; | |||||
static int const offsettable[64] = | |||||
{ | |||||
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024) | |||||
S64(0), | |||||
#undef S1 | |||||
}; | |||||
return mantissatable[offsettable[x >> 10] + (x & 0x3ff)] | |||||
+ exponenttable[x >> 10]; | |||||
} | |||||
/* This algorithm is similar to the OpenEXR implementation, except it | |||||
* uses branchless code in the denormal path. This is slower than the | |||||
* table version, but will be more friendly to the cache for occasional | |||||
* uses. */ | |||||
static inline uint32_t half_to_float_branch(uint16_t x) | |||||
{ | |||||
uint32_t s = (x & 0x8000u) << 16; | |||||
if ((x & 0x7fffu) == 0) | |||||
return (uint32_t)x << 16; | |||||
uint32_t e = x & 0x7c00u; | |||||
uint32_t m = x & 0x03ffu; | |||||
if (e == 0) | |||||
{ | |||||
/* m has 10 significant bits but replicating the leading bit to | |||||
* 8 positions instead of 16 works just as well because of our | |||||
* handcrafted shiftmagic table. */ | |||||
uint32_t v = m | (m >> 1); | |||||
v |= v >> 2; | |||||
v |= v >> 4; | |||||
e = shifttable[(v * shiftmagic) >> 28]; | |||||
/* We don't have to remove the 10th mantissa bit because it gets | |||||
* added to our underestimated exponent. */ | |||||
return s | (((125 - e) << 23) + (m << e)); | |||||
} | |||||
if (e == 0x7c00u) | |||||
{ | |||||
/* The amd64 pipeline likes the if() better than a ternary operator | |||||
* or any other trick I could find. --sam */ | |||||
if (m == 0) | |||||
return s | 0x7f800000u; | |||||
return s | 0x7fc00000u; | |||||
} | |||||
return s | (((e >> 10) + 112) << 23) | (m << 13); | |||||
} | |||||
/* Constructor from float. Uses the non-branching version because benchmarks | |||||
* indicate it is about 80% faster on amd64, and 20% faster on the PS3. The | |||||
* penalty of loading the lookup tables does not seem important. */ | |||||
half half::makefast(float f) | |||||
{ | |||||
union { float f; uint32_t x; } u = { f }; | |||||
return makebits(float_to_half_nobranch(u.x)); | |||||
} | |||||
/* Constructor from float with better precision. */ | |||||
half half::makeaccurate(float f) | |||||
{ | |||||
union { float f; uint32_t x; } u = { f }; | |||||
return makebits(float_to_half_branch(u.x)); | |||||
} | |||||
/* Cast to float. Uses the branching version because loading the tables | |||||
* for only one value is going to be cache-expensive. */ | |||||
half::operator float() const | |||||
{ | |||||
union { float f; uint32_t x; } u; | |||||
u.x = half_to_float_branch(bits); | |||||
return u.f; | |||||
} | |||||
void half::convert(half *dst, float const *src, size_t nelem) | |||||
{ | |||||
for (size_t i = 0; i < nelem; i++) | |||||
{ | |||||
union { float f; uint32_t x; } u; | |||||
u.f = *src++; | |||||
*dst++ = makebits(float_to_half_nobranch(u.x)); | |||||
} | |||||
} | |||||
void half::convert(float *dst, half const *src, size_t nelem) | |||||
{ | |||||
for (size_t i = 0; i < nelem; i++) | |||||
{ | |||||
union { float f; uint32_t x; } u; | |||||
/* This code is really too slow on the PS3, even with the denormal | |||||
* handling stripped off. */ | |||||
u.x = half_to_float_nobranch((*src++).bits); | |||||
*dst++ = u.f; | |||||
} | |||||
} | |||||
} /* namespace lol */ | |||||