diff --git a/TODO.md b/TODO.md index 3d77cae7..7b9b7bf6 100644 --- a/TODO.md +++ b/TODO.md @@ -5,7 +5,6 @@ src/lol/math/bigint.h src/lol/math/constants.h src/lol/math/functions.h - src/lol/math/half.h (needs half.cpp) src/lol/math/noise/* diff --git a/legacy/lol/math/half.h b/include/lol/math/half.h similarity index 52% rename from legacy/lol/math/half.h rename to include/lol/math/half.h index 441e64b2..ee5373d1 100644 --- a/legacy/lol/math/half.h +++ b/include/lol/math/half.h @@ -1,7 +1,7 @@ // // Lol Engine // -// Copyright © 2010—2019 Sam Hocevar +// Copyright © 2010—2020 Sam Hocevar // // Lol Engine is free software. It comes without any warranty, to // the extent permitted by applicable law. You can redistribute it @@ -17,18 +17,19 @@ // -------------- // -#include - #include -#include -#include +#include // std::ostream +#include // uint32_t etc. namespace lol { -/* This is OUR namespace. Don't let Windows headers mess with it. */ -#undef min -#undef max +#if _WIN32 +# pragma push_macro("near") +# pragma push_macro("far") +# undef near +# undef far +#endif namespace half_ops { struct base {}; } @@ -36,39 +37,85 @@ class [[nodiscard]] half : half_ops::base { public: - /* Constructors. Always inline so that the code can work in registers - * instead of calling routines with the hidden "this" parameter. */ - inline half() { } - inline half(int f) { *this = makefast((float)f); } - inline half(float f) { *this = makefast(f); } - inline half(double f) { *this = makefast((float)f); } - inline half(ldouble f) { *this = makefast((float)f); } + // Constructors. Always inline so that the code can work in registers. + inline half() {} + explicit inline half(int f) : half(float(f)) {} + explicit inline half(double f) : half(float(f)) {} + explicit inline half(ldouble f) : half(float(f)) {} + + explicit inline half(float f) + { + union { float f; uint32_t x; } u = { f }; + + // This method is faster than the OpenEXR implementation (very often + // used, eg. in Ogre), with the additional benefit of rounding, inspired + // by James Tursa’s half-precision code. + m_bits = (u.x >> 16) & 0x8000; // Get the sign + uint16_t m = (u.x >> 12) & 0x07ff; // Keep one extra bit for rounding + unsigned int e = (u.x >> 23) & 0xff; // Using int is faster here + + if (e < 103) + { + // If zero, or denormal, or exponent underflows too much for a denormal + // half, return signed zero. + } + else if (e > 142) + { + // If NaN, return NaN. If Inf or exponent overflow, return Inf. + m_bits |= 0x7c00u; + // If exponent was 0xff and one mantissa bit was set, it means NaN, + // not Inf, so make sure we set one mantissa bit too. + m_bits |= e == 255 && (u.x & 0x007fffffu); + } + else if (e < 113) + { + // If exponent underflows but not too much, return a denormal + m |= 0x0800u; + // Extra rounding may overflow and set mantissa to 0 and exponent + // to 1, which is OK. + m_bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); + } + else + { + m_bits |= ((e - 112) << 10) | (m >> 1); + // Extra rounding. An overflow will set mantissa to 0 and increment + // the exponent, which is OK. + m_bits += m & 1; + } + } + [[nodiscard]] inline int is_nan() const { - return ((bits & 0x7c00u) == 0x7c00u) && (bits & 0x03ffu); + return ((m_bits & 0x7c00u) == 0x7c00u) && (m_bits & 0x03ffu); } [[nodiscard]] inline int is_finite() const { - return (bits & 0x7c00u) != 0x7c00u; + return (m_bits & 0x7c00u) != 0x7c00u; } [[nodiscard]] inline int is_inf() const { - return (uint16_t)(bits << 1) == (0x7c00u << 1); + return (uint16_t)(m_bits << 1) == (0x7c00u << 1); } [[nodiscard]] inline int is_normal() const { - return (is_finite() && (bits & 0x7c00u)) || ((bits & 0x7fffu) == 0); + return (is_finite() && (m_bits & 0x7c00u)) || ((m_bits & 0x7fffu) == 0); } - /* Cast to other types -- always inline, see constructors */ - inline half &operator =(int f) { return *this = makefast((float)f); } - inline half &operator =(float f) { return *this = makefast(f); } - inline half &operator =(double f) { return *this = makefast((float)f); } - inline half &operator =(ldouble f) { return *this = makefast((float)f); } + // Convert to string + friend std::ostream &operator<<(std::ostream &stream, half const &v) + { + return stream << float(v); + } + + // Cast to other types -- always inline, see constructors + inline half &operator =(int f) { return *this = half(f); } + inline half &operator =(float f) { return *this = half(f); } + inline half &operator =(double f) { return *this = half(f); } + inline half &operator =(ldouble f) { return *this = half(f); } [[nodiscard]] inline operator int8_t() const { return (int8_t)(float)*this; } [[nodiscard]] inline operator uint8_t() const { return (uint8_t)(float)*this; } [[nodiscard]] inline operator int16_t() const { return (int16_t)(float)*this; } @@ -78,15 +125,16 @@ public: [[nodiscard]] inline operator int64_t() const { return (int64_t)(float)*this; } [[nodiscard]] inline operator uint64_t() const { return (uint64_t)(float)*this; } - [[nodiscard]] operator float() const; + [[nodiscard]] inline operator float() const + { + union { uint32_t x; float f; } u = { tofloatbits() }; + return u.f; + } + [[nodiscard]] inline operator double() const { return (float)(*this); } [[nodiscard]] inline operator ldouble() const { return (float)(*this); } - /* Array conversions */ - static void convert(half *dst, float const *src, size_t nelem); - static void convert(float *dst, half const *src, size_t nelem); - - /* Operations */ + // Operators [[nodiscard]] bool operator ==(half x) const { return (float)*this == (float)x; } [[nodiscard]] bool operator !=(half x) const { return (float)*this != (float)x; } [[nodiscard]] bool operator <(half x) const { return (float)*this < (float)x; } @@ -94,10 +142,10 @@ public: [[nodiscard]] bool operator <=(half x) const { return (float)*this <= (float)x; } [[nodiscard]] bool operator >=(half x) const { return (float)*this >= (float)x; } - [[nodiscard]] bool operator !() const { return !(bits & 0x7fffu); } + [[nodiscard]] bool operator !() const { return !(m_bits & 0x7fffu); } [[nodiscard]] operator bool() const { return !!*this; } - inline half operator -() const { return makebits(bits ^ 0x8000u); } + inline half operator -() const { return frombits(m_bits ^ 0x8000u); } inline half operator +() const { return *this; } inline half &operator +=(half h) { return (*this = (half)(*this + h)); } inline half &operator -=(half h) { return (*this = (half)(*this - h)); } @@ -109,25 +157,78 @@ public: [[nodiscard]] inline float operator *(half h) const { return (float)*this * (float)h; } [[nodiscard]] inline float operator /(half h) const { return (float)*this / (float)h; } - /* Factories */ - static half makefast(float f); - static half makeaccurate(float f); - static inline half makebits(uint16_t x) + inline uint16_t bits() const { return m_bits; } + + static inline half frombits(uint16_t x) { half ret; - ret.bits = x; + ret.m_bits = x; return ret; } - /* Internal representation */ - uint16_t bits; +private: + inline uint32_t tofloatbits() const + { + // This algorithm is similar to the OpenEXR implementation, except it + // uses branchless code in the denormal path. This is slower than the + // table version, but will be more friendly to the cache for occasional + // uses. + uint32_t s = (m_bits & 0x8000u) << 16; + + if ((m_bits & 0x7fffu) == 0) + return (uint32_t)m_bits << 16; + + uint32_t e = m_bits & 0x7c00u; + uint32_t m = m_bits & 0x03ffu; + + if (e == 0) + { + // We use this magic table, inspired by De Bruijn sequences, to + // compute a branchless integer log2. The actual value fetched is + // 24-log2(x+1) for x in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, + // 3fd, 3fe, 3ff. For an explanation of how 0x5a1a1a2u was obtained + // see: http://lolengine.net/blog/2012/04/03/beyond-de-bruijn + static uint32_t const shifttable[16] = + { + 23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, + }; + static uint32_t const shiftmagic = 0x5a1a1a2u; + + /* m has 10 significant bits but replicating the leading bit to + * 8 positions instead of 16 works just as well because of our + * handcrafted shiftmagic table. */ + uint32_t v = m | (m >> 1); + v |= v >> 2; + v |= v >> 4; + + e = shifttable[(v * shiftmagic) >> 28]; + + /* We don't have to remove the 10th mantissa bit because it gets + * added to our underestimated exponent. */ + return s | (((125 - e) << 23) + (m << e)); + } + + if (e == 0x7c00u) + { + /* The amd64 pipeline likes the if() better than a ternary operator + * or any other trick I could find. --sam */ + if (m == 0) + return s | 0x7f800000u; + return s | 0x7fc00000u; + } + + return s | (((e >> 10) + 112) << 23) | (m << 13); + } + + // Internal representation + uint16_t m_bits; }; static_assert(sizeof(half) == 2, "sizeof(half) == 2"); -/* - * Standard math and GLSL functions - */ +// +// Standard math and GLSL functions +// static inline half min(half a, half b) { return a < b ? a : b; } static inline half max(half a, half b) { return a > b ? a : b; } @@ -139,20 +240,20 @@ static inline float fmod(half a, half b) static inline float fract(half a) { return fract((float)a); } static inline float degrees(half a) { return degrees((float)a); } static inline float radians(half a) { return radians((float)a); } -static inline half abs(half a) { return half::makebits(a.bits & 0x7fffu); } +static inline half abs(half a) { return half::frombits(a.bits() & 0x7fffu); } static inline half clamp(half x, half a, half b) { return (x < a) ? a : (x > b) ? b : x; } -/* - * Standard math operators - */ +// +// Standard math operators +// namespace half_ops { - /* Enumerate the types for which operations with half are valid */ + // Enumerate the types for which operations with half are valid template struct valid {}; template struct valid @@ -227,7 +328,11 @@ DECLARE_HALF_BOOL_OPS(<=) #undef DECLARE_HALF_BOOL_OPS -} /* namespace half_ops */ +} // namespace half_ops -} /* namespace lol */ +} // namespace lol +#if _WIN32 +# pragma pop_macro("near") +# pragma pop_macro("far") +#endif diff --git a/include/lol/math/private/matrix.h b/include/lol/math/private/matrix.h index e0806f5d..d73d2a0f 100644 --- a/include/lol/math/private/matrix.h +++ b/include/lol/math/private/matrix.h @@ -29,8 +29,6 @@ # undef far #endif -#define have_lol_matrix_h - namespace lol { @@ -120,7 +118,6 @@ private: }; static_assert(sizeof(imat2) == 16, "sizeof(imat2) == 16"); -static_assert(sizeof(f16mat2) == 8, "sizeof(f16mat2) == 8"); static_assert(sizeof(mat2) == 16, "sizeof(mat2) == 16"); static_assert(sizeof(dmat2) == 32, "sizeof(dmat2) == 32"); @@ -211,7 +208,6 @@ private: }; static_assert(sizeof(imat3) == 36, "sizeof(imat3) == 36"); -static_assert(sizeof(f16mat3) == 18, "sizeof(f16mat3) == 18"); static_assert(sizeof(mat3) == 36, "sizeof(mat3) == 36"); static_assert(sizeof(dmat3) == 72, "sizeof(dmat3) == 72"); @@ -345,7 +341,6 @@ private: }; static_assert(sizeof(imat4) == 64, "sizeof(imat4) == 64"); -static_assert(sizeof(f16mat4) == 32, "sizeof(f16mat4) == 32"); static_assert(sizeof(mat4) == 64, "sizeof(mat4) == 64"); static_assert(sizeof(dmat4) == 128, "sizeof(dmat4) == 128"); diff --git a/include/lol/math/private/ops.h b/include/lol/math/private/ops.h index c7c9affe..edafb5b4 100644 --- a/include/lol/math/private/ops.h +++ b/include/lol/math/private/ops.h @@ -20,8 +20,6 @@ #include #include -#include - namespace lol { diff --git a/include/lol/math/transform.h b/include/lol/math/transform.h index 1bdeebad..4927a731 100644 --- a/include/lol/math/transform.h +++ b/include/lol/math/transform.h @@ -71,7 +71,6 @@ struct [[nodiscard]] cmplx_t : public linear_ops::base T x, y; }; -static_assert(sizeof(f16cmplx) == 4, "sizeof(f16cmplx) == 4"); static_assert(sizeof(cmplx) == 8, "sizeof(cmplx) == 8"); static_assert(sizeof(dcmplx) == 16, "sizeof(dcmplx) == 16"); @@ -250,7 +249,6 @@ struct [[nodiscard]] quat_t : public linear_ops::base T w, x, y, z; }; -static_assert(sizeof(f16quat) == 8, "sizeof(f16quat) == 8"); static_assert(sizeof(quat) == 16, "sizeof(quat) == 16"); static_assert(sizeof(dquat) == 32, "sizeof(dquat) == 32"); diff --git a/include/lol/math/vector.h b/include/lol/math/vector.h index bd9bad3c..5079bc51 100644 --- a/include/lol/math/vector.h +++ b/include/lol/math/vector.h @@ -20,7 +20,7 @@ #include #include -#include +#include // std::ostream #include namespace lol @@ -301,7 +301,6 @@ static_assert(sizeof(i16vec2) == 4, "sizeof(i16vec2) == 4"); static_assert(sizeof(ivec2) == 8, "sizeof(ivec2) == 8"); static_assert(sizeof(i64vec2) == 16, "sizeof(i64vec2) == 16"); -static_assert(sizeof(f16vec2) == 4, "sizeof(f16vec2) == 4"); static_assert(sizeof(vec2) == 8, "sizeof(vec2) == 8"); static_assert(sizeof(dvec2) == 16, "sizeof(dvec2) == 16"); @@ -542,7 +541,6 @@ static_assert(sizeof(i16vec3) == 6, "sizeof(i16vec3) == 6"); static_assert(sizeof(ivec3) == 12, "sizeof(ivec3) == 12"); static_assert(sizeof(i64vec3) == 24, "sizeof(i64vec3) == 24"); -static_assert(sizeof(f16vec3) == 6, "sizeof(f16vec3) == 6"); static_assert(sizeof(vec3) == 12, "sizeof(vec3) == 12"); static_assert(sizeof(dvec3) == 24, "sizeof(dvec3) == 24"); @@ -975,7 +973,6 @@ static_assert(sizeof(i16vec4) == 8, "sizeof(i16vec4) == 8"); static_assert(sizeof(ivec4) == 16, "sizeof(ivec4) == 16"); static_assert(sizeof(i64vec4) == 32, "sizeof(i64vec4) == 32"); -static_assert(sizeof(f16vec4) == 8, "sizeof(f16vec4) == 8"); static_assert(sizeof(vec4) == 16, "sizeof(vec4) == 16"); static_assert(sizeof(dvec4) == 32, "sizeof(dvec4) == 32"); diff --git a/legacy/math/half.cpp b/legacy/math/half.cpp deleted file mode 100644 index ad7f95f1..00000000 --- a/legacy/math/half.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// -// Lol Engine -// -// Copyright © 2010—2019 Sam Hocevar -// -// Lol Engine is free software. It comes without any warranty, to -// the extent permitted by applicable law. You can redistribute it -// and/or modify it under the terms of the Do What the Fuck You Want -// to Public License, Version 2, as published by the WTFPL Task Force. -// See http://www.wtfpl.net/ for more details. -// - -#include - -namespace lol -{ - -/* These macros implement a finite iterator useful to build lookup - * tables. For instance, S64(0) will call S1(x) for all values of x - * between 0 and 63. - * Due to the exponential behaviour of the calls, the stress on the - * compiler may be important. */ -#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3) -#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12) -#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48) -#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192) -#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768) - -/* Lookup table-based algorithm from “Fast Half Float Conversions” - * by Jeroen van der Zijp, November 2008. No rounding is performed, - * and some NaN values may be incorrectly converted to Inf (because - * the lowest order bits in the mantissa are ignored). */ -static inline uint16_t float_to_half_nobranch(uint32_t x) -{ - static uint16_t const basetable[512] = - { -#define S1(i) (((i) < 103) ? 0x0000u : \ - ((i) < 113) ? 0x0400u >> (0x1f & (113 - (i))) : \ - ((i) < 143) ? ((i) - 112) << 10 : 0x7c00u) - S256(0), -#undef S1 -#define S1(i) (uint16_t)(0x8000u | basetable[i]) - S256(0), -#undef S1 - }; - - static uint8_t const shifttable[512] = - { -#define S1(i) (((i) < 103) ? 24 : \ - ((i) < 113) ? 126 - (i) : \ - ((i) < 143 || (i) == 255) ? 13 : 24) - S256(0), S256(0), -#undef S1 - }; - - uint16_t bits = basetable[(x >> 23) & 0x1ff]; - bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; - return bits; -} - -/* This method is faster than the OpenEXR implementation (very often - * used, eg. in Ogre), with the additional benefit of rounding, inspired - * by James Tursa’s half-precision code. */ -static inline uint16_t float_to_half_branch(uint32_t x) -{ - uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */ - uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ - unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ - - /* If zero, or denormal, or exponent underflows too much for a denormal - * half, return signed zero. */ - if (e < 103) - return bits; - - /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ - if (e > 142) - { - bits |= 0x7c00u; - /* If exponent was 0xff and one mantissa bit was set, it means NaN, - * not Inf, so make sure we set one mantissa bit too. */ - bits |= e == 255 && (x & 0x007fffffu); - return bits; - } - - /* If exponent underflows but not too much, return a denormal */ - if (e < 113) - { - m |= 0x0800u; - /* Extra rounding may overflow and set mantissa to 0 and exponent - * to 1, which is OK. */ - bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); - return bits; - } - - bits |= ((e - 112) << 10) | (m >> 1); - /* Extra rounding. An overflow will set mantissa to 0 and increment - * the exponent, which is OK. */ - bits += m & 1; - return bits; -} - -/* We use this magic table, inspired by De Bruijn sequences, to compute a - * branchless integer log2. The actual value fetched is 24-log2(x+1) for x - * in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff. See - * http://lolengine.net/blog/2012/04/03/beyond-de-bruijn for an explanation - * of how the value 0x5a1a1a2u was obtained. */ -static uint32_t const shifttable[16] = -{ - 23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, -}; -static uint32_t const shiftmagic = 0x5a1a1a2u; - -/* Lookup table-based algorithm from “Fast Half Float Conversions” - * by Jeroen van der Zijp, November 2008. Tables are generated using - * the C++ preprocessor, thanks to a branchless implementation also - * used in half_to_float_branch(). This code is very fast when performing - * conversions on arrays of values. */ -static inline uint32_t half_to_float_nobranch(uint16_t x) -{ -#define M3(i) ((i) | ((i) >> 1)) -#define M7(i) (M3(i) | (M3(i) >> 2)) -#define MF(i) (M7(i) | (M7(i) >> 4)) -#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28] - - static uint32_t const mantissatable[2048] = - { -#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i))) - S1024(0), -#undef S1 -#define S1(i) (0x38000000u + ((i) << 13)) - S1024(0), -#undef S1 - }; - - static uint32_t const exponenttable[64] = - { -#define S1(i) (((i) == 0) ? 0 : \ - ((i) < 31) ? ((uint32_t)(i) << 23) : \ - ((i) == 31) ? 0x47800000u : \ - ((i) == 32) ? 0x80000000u : \ - ((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000) - S64(0), -#undef S1 - }; - - static int const offsettable[64] = - { -#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024) - S64(0), -#undef S1 - }; - - return mantissatable[offsettable[x >> 10] + (x & 0x3ff)] - + exponenttable[x >> 10]; -} - -/* This algorithm is similar to the OpenEXR implementation, except it - * uses branchless code in the denormal path. This is slower than the - * table version, but will be more friendly to the cache for occasional - * uses. */ -static inline uint32_t half_to_float_branch(uint16_t x) -{ - uint32_t s = (x & 0x8000u) << 16; - - if ((x & 0x7fffu) == 0) - return (uint32_t)x << 16; - - uint32_t e = x & 0x7c00u; - uint32_t m = x & 0x03ffu; - - if (e == 0) - { - /* m has 10 significant bits but replicating the leading bit to - * 8 positions instead of 16 works just as well because of our - * handcrafted shiftmagic table. */ - uint32_t v = m | (m >> 1); - v |= v >> 2; - v |= v >> 4; - - e = shifttable[(v * shiftmagic) >> 28]; - - /* We don't have to remove the 10th mantissa bit because it gets - * added to our underestimated exponent. */ - return s | (((125 - e) << 23) + (m << e)); - } - - if (e == 0x7c00u) - { - /* The amd64 pipeline likes the if() better than a ternary operator - * or any other trick I could find. --sam */ - if (m == 0) - return s | 0x7f800000u; - return s | 0x7fc00000u; - } - - return s | (((e >> 10) + 112) << 23) | (m << 13); -} - -/* Constructor from float. Uses the non-branching version because benchmarks - * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The - * penalty of loading the lookup tables does not seem important. */ -half half::makefast(float f) -{ - union { float f; uint32_t x; } u = { f }; - return makebits(float_to_half_nobranch(u.x)); -} - -/* Constructor from float with better precision. */ -half half::makeaccurate(float f) -{ - union { float f; uint32_t x; } u = { f }; - return makebits(float_to_half_branch(u.x)); -} - -/* Cast to float. Uses the branching version because loading the tables - * for only one value is going to be cache-expensive. */ -half::operator float() const -{ - union { float f; uint32_t x; } u; - u.x = half_to_float_branch(bits); - return u.f; -} - -void half::convert(half *dst, float const *src, size_t nelem) -{ - for (size_t i = 0; i < nelem; i++) - { - union { float f; uint32_t x; } u; - u.f = *src++; - *dst++ = makebits(float_to_half_nobranch(u.x)); - } -} - -void half::convert(float *dst, half const *src, size_t nelem) -{ - for (size_t i = 0; i < nelem; i++) - { - union { float f; uint32_t x; } u; - - /* This code is really too slow on the PS3, even with the denormal - * handling stripped off. */ - u.x = half_to_float_nobranch((*src++).bits); - *dst++ = u.f; - } -} - -} /* namespace lol */ -