@@ -5,7 +5,6 @@ | |||
src/lol/math/bigint.h | |||
src/lol/math/constants.h | |||
src/lol/math/functions.h | |||
src/lol/math/half.h (needs half.cpp) | |||
src/lol/math/noise/* | |||
@@ -1,7 +1,7 @@ | |||
// | |||
// Lol Engine | |||
// | |||
// Copyright © 2010—2019 Sam Hocevar <sam@hocevar.net> | |||
// Copyright © 2010—2020 Sam Hocevar <sam@hocevar.net> | |||
// | |||
// Lol Engine is free software. It comes without any warranty, to | |||
// the extent permitted by applicable law. You can redistribute it | |||
@@ -17,18 +17,19 @@ | |||
// -------------- | |||
// | |||
#include <lol/base/types.h> | |||
#include <cmath> | |||
#include <cstdio> | |||
#include <stdint.h> | |||
#include <ostream> // std::ostream | |||
#include <stdint.h> // uint32_t etc. | |||
namespace lol | |||
{ | |||
/* This is OUR namespace. Don't let Windows headers mess with it. */ | |||
#undef min | |||
#undef max | |||
#if _WIN32 | |||
# pragma push_macro("near") | |||
# pragma push_macro("far") | |||
# undef near | |||
# undef far | |||
#endif | |||
namespace half_ops { struct base {}; } | |||
@@ -36,39 +37,85 @@ class [[nodiscard]] half | |||
: half_ops::base | |||
{ | |||
public: | |||
/* Constructors. Always inline so that the code can work in registers | |||
* instead of calling routines with the hidden "this" parameter. */ | |||
inline half() { } | |||
inline half(int f) { *this = makefast((float)f); } | |||
inline half(float f) { *this = makefast(f); } | |||
inline half(double f) { *this = makefast((float)f); } | |||
inline half(ldouble f) { *this = makefast((float)f); } | |||
// Constructors. Always inline so that the code can work in registers. | |||
inline half() {} | |||
explicit inline half(int f) : half(float(f)) {} | |||
explicit inline half(double f) : half(float(f)) {} | |||
explicit inline half(ldouble f) : half(float(f)) {} | |||
explicit inline half(float f) | |||
{ | |||
union { float f; uint32_t x; } u = { f }; | |||
// This method is faster than the OpenEXR implementation (very often | |||
// used, e.g. in Ogre), with the additional benefit of rounding. It is | |||
// inspired by James Tursa’s half-precision code. | |||
m_bits = (u.x >> 16) & 0x8000; // Get the sign | |||
uint16_t m = (u.x >> 12) & 0x07ff; // Keep one extra bit for rounding | |||
unsigned int e = (u.x >> 23) & 0xff; // Using int is faster here | |||
if (e < 103) | |||
{ | |||
// If zero, or denormal, or exponent underflows too much for a denormal | |||
// half, return signed zero. | |||
} | |||
else if (e > 142) | |||
{ | |||
// If NaN, return NaN. If Inf or exponent overflow, return Inf. | |||
m_bits |= 0x7c00u; | |||
// If exponent was 0xff and one mantissa bit was set, it means NaN, | |||
// not Inf, so make sure we set one mantissa bit too. | |||
m_bits |= e == 255 && (u.x & 0x007fffffu); | |||
} | |||
else if (e < 113) | |||
{ | |||
// If exponent underflows but not too much, return a denormal | |||
m |= 0x0800u; | |||
// Extra rounding may overflow and set mantissa to 0 and exponent | |||
// to 1, which is OK. | |||
m_bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); | |||
} | |||
else | |||
{ | |||
m_bits |= ((e - 112) << 10) | (m >> 1); | |||
// Extra rounding. An overflow will set mantissa to 0 and increment | |||
// the exponent, which is OK. | |||
m_bits += m & 1; | |||
} | |||
} | |||
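// Worked example (illustration only, not in the original source): 1.0f has
// bits 0x3f800000, so m = 0 and e = 127; the final branch produces
// m_bits = ((127 - 112) << 10) | (0 >> 1) = 0x3c00, the binary16 encoding
// of 1.0, and the rounding step adds m & 1 = 0.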
[[nodiscard]] inline int is_nan() const | |||
{ | |||
return ((bits & 0x7c00u) == 0x7c00u) && (bits & 0x03ffu); | |||
return ((m_bits & 0x7c00u) == 0x7c00u) && (m_bits & 0x03ffu); | |||
} | |||
[[nodiscard]] inline int is_finite() const | |||
{ | |||
return (bits & 0x7c00u) != 0x7c00u; | |||
return (m_bits & 0x7c00u) != 0x7c00u; | |||
} | |||
[[nodiscard]] inline int is_inf() const | |||
{ | |||
return (uint16_t)(bits << 1) == (0x7c00u << 1); | |||
return (uint16_t)(m_bits << 1) == (0x7c00u << 1); | |||
} | |||
[[nodiscard]] inline int is_normal() const | |||
{ | |||
return (is_finite() && (bits & 0x7c00u)) || ((bits & 0x7fffu) == 0); | |||
return (is_finite() && (m_bits & 0x7c00u)) || ((m_bits & 0x7fffu) == 0); | |||
} | |||
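// Classification examples (illustration only, assuming the frombits()
// factory defined below):
//   half::frombits(0x7c00).is_inf()    -> true  (+infinity)
//   half::frombits(0x7e00).is_nan()    -> true  (quiet NaN)
//   half::frombits(0x0001).is_normal() -> false (smallest denormal)
//   half::frombits(0x0000).is_normal() -> true  (zero counts as normal here)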
/* Cast to other types -- always inline, see constructors */ | |||
inline half &operator =(int f) { return *this = makefast((float)f); } | |||
inline half &operator =(float f) { return *this = makefast(f); } | |||
inline half &operator =(double f) { return *this = makefast((float)f); } | |||
inline half &operator =(ldouble f) { return *this = makefast((float)f); } | |||
// Output to a std::ostream | |||
friend std::ostream &operator<<(std::ostream &stream, half const &v) | |||
{ | |||
return stream << float(v); | |||
} | |||
// Cast to other types -- always inline, see constructors | |||
inline half &operator =(int f) { return *this = half(f); } | |||
inline half &operator =(float f) { return *this = half(f); } | |||
inline half &operator =(double f) { return *this = half(f); } | |||
inline half &operator =(ldouble f) { return *this = half(f); } | |||
[[nodiscard]] inline operator int8_t() const { return (int8_t)(float)*this; } | |||
[[nodiscard]] inline operator uint8_t() const { return (uint8_t)(float)*this; } | |||
[[nodiscard]] inline operator int16_t() const { return (int16_t)(float)*this; } | |||
@@ -78,15 +125,16 @@ public: | |||
[[nodiscard]] inline operator int64_t() const { return (int64_t)(float)*this; } | |||
[[nodiscard]] inline operator uint64_t() const { return (uint64_t)(float)*this; } | |||
[[nodiscard]] operator float() const; | |||
[[nodiscard]] inline operator float() const | |||
{ | |||
union { uint32_t x; float f; } u = { tofloatbits() }; | |||
return u.f; | |||
} | |||
[[nodiscard]] inline operator double() const { return (float)(*this); } | |||
[[nodiscard]] inline operator ldouble() const { return (float)(*this); } | |||
/* Array conversions */ | |||
static void convert(half *dst, float const *src, size_t nelem); | |||
static void convert(float *dst, half const *src, size_t nelem); | |||
/* Operations */ | |||
// Operators | |||
[[nodiscard]] bool operator ==(half x) const { return (float)*this == (float)x; } | |||
[[nodiscard]] bool operator !=(half x) const { return (float)*this != (float)x; } | |||
[[nodiscard]] bool operator <(half x) const { return (float)*this < (float)x; } | |||
@@ -94,10 +142,10 @@ public: | |||
[[nodiscard]] bool operator <=(half x) const { return (float)*this <= (float)x; } | |||
[[nodiscard]] bool operator >=(half x) const { return (float)*this >= (float)x; } | |||
[[nodiscard]] bool operator !() const { return !(bits & 0x7fffu); } | |||
[[nodiscard]] bool operator !() const { return !(m_bits & 0x7fffu); } | |||
[[nodiscard]] operator bool() const { return !!*this; } | |||
inline half operator -() const { return makebits(bits ^ 0x8000u); } | |||
inline half operator -() const { return frombits(m_bits ^ 0x8000u); } | |||
inline half operator +() const { return *this; } | |||
inline half &operator +=(half h) { return (*this = (half)(*this + h)); } | |||
inline half &operator -=(half h) { return (*this = (half)(*this - h)); } | |||
@@ -109,25 +157,78 @@ public: | |||
[[nodiscard]] inline float operator *(half h) const { return (float)*this * (float)h; } | |||
[[nodiscard]] inline float operator /(half h) const { return (float)*this / (float)h; } | |||
/* Factories */ | |||
static half makefast(float f); | |||
static half makeaccurate(float f); | |||
static inline half makebits(uint16_t x) | |||
inline uint16_t bits() const { return m_bits; } | |||
static inline half frombits(uint16_t x) | |||
{ | |||
half ret; | |||
ret.bits = x; | |||
ret.m_bits = x; | |||
return ret; | |||
} | |||
/* Internal representation */ | |||
uint16_t bits; | |||
private: | |||
inline uint32_t tofloatbits() const | |||
{ | |||
// This algorithm is similar to the OpenEXR implementation, except it | |||
// uses branchless code in the denormal path. This is slower than the | |||
// table version, but will be more friendly to the cache for occasional | |||
// uses. | |||
uint32_t s = (m_bits & 0x8000u) << 16; | |||
if ((m_bits & 0x7fffu) == 0) | |||
return (uint32_t)m_bits << 16; | |||
uint32_t e = m_bits & 0x7c00u; | |||
uint32_t m = m_bits & 0x03ffu; | |||
if (e == 0) | |||
{ | |||
// We use this magic table, inspired by De Bruijn sequences, to | |||
// compute a branchless integer log2. The actual value fetched is | |||
// 24-log2(x+1) for x in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, | |||
// 3fd, 3fe, 3ff. For an explanation of how 0x5a1a1a2u was obtained | |||
// see: http://lolengine.net/blog/2012/04/03/beyond-de-bruijn | |||
static uint32_t const shifttable[16] = | |||
{ | |||
23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, | |||
}; | |||
static uint32_t const shiftmagic = 0x5a1a1a2u; | |||
// m has 10 significant bits but replicating the leading bit to | |||
// 8 positions instead of 16 works just as well because of our | |||
// handcrafted shiftmagic table. | |||
uint32_t v = m | (m >> 1); | |||
v |= v >> 2; | |||
v |= v >> 4; | |||
e = shifttable[(v * shiftmagic) >> 28]; | |||
// We don't have to remove the 10th mantissa bit because it gets | |||
// added to our underestimated exponent. | |||
return s | (((125 - e) << 23) + (m << e)); | |||
} | |||
if (e == 0x7c00u) | |||
{ | |||
// The amd64 pipeline likes the if() better than a ternary operator | |||
// or any other trick I could find. --sam | |||
if (m == 0) | |||
return s | 0x7f800000u; | |||
return s | 0x7fc00000u; | |||
} | |||
return s | (((e >> 10) + 112) << 23) | (m << 13); | |||
} | |||
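// Worked example of the denormal path (illustration only, not in the
// original source): m_bits = 0x0001 gives m = 1, so v = 1, the magic
// multiply selects shifttable[0] = 23, and the result is
// ((125 - 23) << 23) + (1 << 23) = 0x33800000, i.e. 2^-24, the smallest
// binary16 denormal.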
// Internal representation | |||
uint16_t m_bits; | |||
}; | |||
static_assert(sizeof(half) == 2, "sizeof(half) == 2"); | |||
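// Minimal usage sketch (illustration only, not part of this header;
// assumes <iostream> for std::cout):
//
//     lol::half h(3.14f);                           // explicit float constructor
//     float f = float(h);                           // back to float, ~3.140625
//     lol::half one = lol::half::frombits(0x3c00);  // exactly 1.0
//     std::cout << h << '\n';                       // streamed as a float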
/* | |||
* Standard math and GLSL functions | |||
*/ | |||
// | |||
// Standard math and GLSL functions | |||
// | |||
static inline half min(half a, half b) { return a < b ? a : b; } | |||
static inline half max(half a, half b) { return a > b ? a : b; } | |||
@@ -139,20 +240,20 @@ static inline float fmod(half a, half b) | |||
static inline float fract(half a) { return fract((float)a); } | |||
static inline float degrees(half a) { return degrees((float)a); } | |||
static inline float radians(half a) { return radians((float)a); } | |||
static inline half abs(half a) { return half::makebits(a.bits & 0x7fffu); } | |||
static inline half abs(half a) { return half::frombits(a.bits() & 0x7fffu); } | |||
static inline half clamp(half x, half a, half b) | |||
{ | |||
return (x < a) ? a : (x > b) ? b : x; | |||
} | |||
/* | |||
* Standard math operators | |||
*/ | |||
// | |||
// Standard math operators | |||
// | |||
namespace half_ops | |||
{ | |||
/* Enumerate the types for which operations with half are valid */ | |||
// Enumerate the types for which operations with half are valid | |||
template<typename FROM, typename TO = void> struct valid {}; | |||
template<typename TO> struct valid<uint8_t, TO> | |||
@@ -227,7 +328,11 @@ DECLARE_HALF_BOOL_OPS(<=) | |||
#undef DECLARE_HALF_BOOL_OPS | |||
} /* namespace half_ops */ | |||
} // namespace half_ops | |||
} /* namespace lol */ | |||
} // namespace lol | |||
#if _WIN32 | |||
# pragma pop_macro("near") | |||
# pragma pop_macro("far") | |||
#endif |
@@ -29,8 +29,6 @@ | |||
# undef far | |||
#endif | |||
#define have_lol_matrix_h | |||
namespace lol | |||
{ | |||
@@ -120,7 +118,6 @@ private: | |||
}; | |||
static_assert(sizeof(imat2) == 16, "sizeof(imat2) == 16"); | |||
static_assert(sizeof(f16mat2) == 8, "sizeof(f16mat2) == 8"); | |||
static_assert(sizeof(mat2) == 16, "sizeof(mat2) == 16"); | |||
static_assert(sizeof(dmat2) == 32, "sizeof(dmat2) == 32"); | |||
@@ -211,7 +208,6 @@ private: | |||
}; | |||
static_assert(sizeof(imat3) == 36, "sizeof(imat3) == 36"); | |||
static_assert(sizeof(f16mat3) == 18, "sizeof(f16mat3) == 18"); | |||
static_assert(sizeof(mat3) == 36, "sizeof(mat3) == 36"); | |||
static_assert(sizeof(dmat3) == 72, "sizeof(dmat3) == 72"); | |||
@@ -345,7 +341,6 @@ private: | |||
}; | |||
static_assert(sizeof(imat4) == 64, "sizeof(imat4) == 64"); | |||
static_assert(sizeof(f16mat4) == 32, "sizeof(f16mat4) == 32"); | |||
static_assert(sizeof(mat4) == 64, "sizeof(mat4) == 64"); | |||
static_assert(sizeof(dmat4) == 128, "sizeof(dmat4) == 128"); | |||
@@ -20,8 +20,6 @@ | |||
#include <ostream> | |||
#include <type_traits> | |||
#include <lol/math/half.h> | |||
namespace lol | |||
{ | |||
@@ -71,7 +71,6 @@ struct [[nodiscard]] cmplx_t : public linear_ops::base<T> | |||
T x, y; | |||
}; | |||
static_assert(sizeof(f16cmplx) == 4, "sizeof(f16cmplx) == 4"); | |||
static_assert(sizeof(cmplx) == 8, "sizeof(cmplx) == 8"); | |||
static_assert(sizeof(dcmplx) == 16, "sizeof(dcmplx) == 16"); | |||
@@ -250,7 +249,6 @@ struct [[nodiscard]] quat_t : public linear_ops::base<T> | |||
T w, x, y, z; | |||
}; | |||
static_assert(sizeof(f16quat) == 8, "sizeof(f16quat) == 8"); | |||
static_assert(sizeof(quat) == 16, "sizeof(quat) == 16"); | |||
static_assert(sizeof(dquat) == 32, "sizeof(dquat) == 32"); | |||
@@ -20,7 +20,7 @@ | |||
#include <lol/math/private/ops.h> | |||
#include <cassert> | |||
#include <ostream> | |||
#include <ostream> // std::ostream | |||
#include <type_traits> | |||
namespace lol | |||
@@ -301,7 +301,6 @@ static_assert(sizeof(i16vec2) == 4, "sizeof(i16vec2) == 4"); | |||
static_assert(sizeof(ivec2) == 8, "sizeof(ivec2) == 8"); | |||
static_assert(sizeof(i64vec2) == 16, "sizeof(i64vec2) == 16"); | |||
static_assert(sizeof(f16vec2) == 4, "sizeof(f16vec2) == 4"); | |||
static_assert(sizeof(vec2) == 8, "sizeof(vec2) == 8"); | |||
static_assert(sizeof(dvec2) == 16, "sizeof(dvec2) == 16"); | |||
@@ -542,7 +541,6 @@ static_assert(sizeof(i16vec3) == 6, "sizeof(i16vec3) == 6"); | |||
static_assert(sizeof(ivec3) == 12, "sizeof(ivec3) == 12"); | |||
static_assert(sizeof(i64vec3) == 24, "sizeof(i64vec3) == 24"); | |||
static_assert(sizeof(f16vec3) == 6, "sizeof(f16vec3) == 6"); | |||
static_assert(sizeof(vec3) == 12, "sizeof(vec3) == 12"); | |||
static_assert(sizeof(dvec3) == 24, "sizeof(dvec3) == 24"); | |||
@@ -975,7 +973,6 @@ static_assert(sizeof(i16vec4) == 8, "sizeof(i16vec4) == 8"); | |||
static_assert(sizeof(ivec4) == 16, "sizeof(ivec4) == 16"); | |||
static_assert(sizeof(i64vec4) == 32, "sizeof(i64vec4) == 32"); | |||
static_assert(sizeof(f16vec4) == 8, "sizeof(f16vec4) == 8"); | |||
static_assert(sizeof(vec4) == 16, "sizeof(vec4) == 16"); | |||
static_assert(sizeof(dvec4) == 32, "sizeof(dvec4) == 32"); | |||
@@ -1,248 +0,0 @@ | |||
// | |||
// Lol Engine | |||
// | |||
// Copyright © 2010—2019 Sam Hocevar <sam@hocevar.net> | |||
// | |||
// Lol Engine is free software. It comes without any warranty, to | |||
// the extent permitted by applicable law. You can redistribute it | |||
// and/or modify it under the terms of the Do What the Fuck You Want | |||
// to Public License, Version 2, as published by the WTFPL Task Force. | |||
// See http://www.wtfpl.net/ for more details. | |||
// | |||
#include <lol/engine-internal.h> | |||
namespace lol | |||
{ | |||
/* These macros implement a finite iterator useful to build lookup | |||
* tables. For instance, S64(0) will call S1(x) for all values of x | |||
* between 0 and 63. | |||
* Due to the exponential behaviour of the calls, the stress on the | |||
* compiler may be significant. */ | |||
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3) | |||
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12) | |||
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48) | |||
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192) | |||
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768) | |||
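/* For example, with "#define S1(i) (i)" in scope, S4(0) expands to
 * (0), (0+1), (0+2), (0+3) (up to extra parentheses), and S16(0) chains
 * four such S4 blocks to cover 0..15; S1024(0) covers 0..1023 likewise. */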
/* Lookup table-based algorithm from “Fast Half Float Conversions” | |||
* by Jeroen van der Zijp, November 2008. No rounding is performed, | |||
* and some NaN values may be incorrectly converted to Inf (because | |||
* the lowest order bits in the mantissa are ignored). */ | |||
static inline uint16_t float_to_half_nobranch(uint32_t x) | |||
{ | |||
static uint16_t const basetable[512] = | |||
{ | |||
#define S1(i) (((i) < 103) ? 0x0000u : \ | |||
((i) < 113) ? 0x0400u >> (0x1f & (113 - (i))) : \ | |||
((i) < 143) ? ((i) - 112) << 10 : 0x7c00u) | |||
S256(0), | |||
#undef S1 | |||
#define S1(i) (uint16_t)(0x8000u | basetable[i]) | |||
S256(0), | |||
#undef S1 | |||
}; | |||
static uint8_t const shifttable[512] = | |||
{ | |||
#define S1(i) (((i) < 103) ? 24 : \ | |||
((i) < 113) ? 126 - (i) : \ | |||
((i) < 143 || (i) == 255) ? 13 : 24) | |||
S256(0), S256(0), | |||
#undef S1 | |||
}; | |||
uint16_t bits = basetable[(x >> 23) & 0x1ff]; | |||
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; | |||
return bits; | |||
} | |||
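/* Worked example (illustration only, not in the original source): 3.14f has
 * bits 0x4048f5c3, so the table index is 128, basetable[128] = 0x4000,
 * shifttable[128] = 13, and the mantissa 0x48f5c3 >> 13 = 0x247, giving
 * 0x4247, i.e. 3.138671875 (truncated rather than rounded). */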
/* This method is faster than the OpenEXR implementation (very often | |||
 * used, e.g. in Ogre), with the additional benefit of rounding. It is | |||
 * inspired by James Tursa’s half-precision code. */ | |||
static inline uint16_t float_to_half_branch(uint32_t x) | |||
{ | |||
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */ | |||
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ | |||
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ | |||
/* If zero, or denormal, or exponent underflows too much for a denormal | |||
* half, return signed zero. */ | |||
if (e < 103) | |||
return bits; | |||
/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ | |||
if (e > 142) | |||
{ | |||
bits |= 0x7c00u; | |||
/* If exponent was 0xff and one mantissa bit was set, it means NaN, | |||
* not Inf, so make sure we set one mantissa bit too. */ | |||
bits |= e == 255 && (x & 0x007fffffu); | |||
return bits; | |||
} | |||
/* If exponent underflows but not too much, return a denormal */ | |||
if (e < 113) | |||
{ | |||
m |= 0x0800u; | |||
/* Extra rounding may overflow and set mantissa to 0 and exponent | |||
* to 1, which is OK. */ | |||
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); | |||
return bits; | |||
} | |||
bits |= ((e - 112) << 10) | (m >> 1); | |||
/* Extra rounding. An overflow will set mantissa to 0 and increment | |||
* the exponent, which is OK. */ | |||
bits += m & 1; | |||
return bits; | |||
} | |||
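/* Worked example (illustration only, not in the original source): for 3.14f
 * (bits 0x4048f5c3), m = 0x48f keeps one extra rounding bit and e = 128, so
 * bits = ((128 - 112) << 10) | (0x48f >> 1) = 0x4247, and the final
 * "bits += m & 1" rounds up to 0x4248, i.e. 3.140625. */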
/* We use this magic table, inspired by De Bruijn sequences, to compute a | |||
* branchless integer log2. The actual value fetched is 24-log2(x+1) for x | |||
* in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff. See | |||
* http://lolengine.net/blog/2012/04/03/beyond-de-bruijn for an explanation | |||
* of how the value 0x5a1a1a2u was obtained. */ | |||
static uint32_t const shifttable[16] = | |||
{ | |||
23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0, | |||
}; | |||
static uint32_t const shiftmagic = 0x5a1a1a2u; | |||
/* Lookup table-based algorithm from “Fast Half Float Conversions” | |||
* by Jeroen van der Zijp, November 2008. Tables are generated using | |||
* the C++ preprocessor, thanks to a branchless implementation also | |||
* used in half_to_float_branch(). This code is very fast when performing | |||
* conversions on arrays of values. */ | |||
static inline uint32_t half_to_float_nobranch(uint16_t x) | |||
{ | |||
#define M3(i) ((i) | ((i) >> 1)) | |||
#define M7(i) (M3(i) | (M3(i) >> 2)) | |||
#define MF(i) (M7(i) | (M7(i) >> 4)) | |||
#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28] | |||
static uint32_t const mantissatable[2048] = | |||
{ | |||
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i))) | |||
S1024(0), | |||
#undef S1 | |||
#define S1(i) (0x38000000u + ((i) << 13)) | |||
S1024(0), | |||
#undef S1 | |||
}; | |||
static uint32_t const exponenttable[64] = | |||
{ | |||
#define S1(i) (((i) == 0) ? 0 : \ | |||
((i) < 31) ? ((uint32_t)(i) << 23) : \ | |||
((i) == 31) ? 0x47800000u : \ | |||
((i) == 32) ? 0x80000000u : \ | |||
((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000) | |||
S64(0), | |||
#undef S1 | |||
}; | |||
static int const offsettable[64] = | |||
{ | |||
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024) | |||
S64(0), | |||
#undef S1 | |||
}; | |||
return mantissatable[offsettable[x >> 10] + (x & 0x3ff)] | |||
+ exponenttable[x >> 10]; | |||
} | |||
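/* Worked example (illustration only, not in the original source): x = 0x3c00
 * (half 1.0) gives x >> 10 = 15, so offsettable[15] = 1024,
 * exponenttable[15] = 15 << 23 = 0x07800000 and mantissatable[1024] =
 * 0x38000000; their sum is 0x3f800000, the float encoding of 1.0. */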
/* This algorithm is similar to the OpenEXR implementation, except it | |||
* uses branchless code in the denormal path. This is slower than the | |||
* table version, but will be more friendly to the cache for occasional | |||
* uses. */ | |||
static inline uint32_t half_to_float_branch(uint16_t x) | |||
{ | |||
uint32_t s = (x & 0x8000u) << 16; | |||
if ((x & 0x7fffu) == 0) | |||
return (uint32_t)x << 16; | |||
uint32_t e = x & 0x7c00u; | |||
uint32_t m = x & 0x03ffu; | |||
if (e == 0) | |||
{ | |||
/* m has 10 significant bits but replicating the leading bit to | |||
* 8 positions instead of 16 works just as well because of our | |||
* handcrafted shiftmagic table. */ | |||
uint32_t v = m | (m >> 1); | |||
v |= v >> 2; | |||
v |= v >> 4; | |||
e = shifttable[(v * shiftmagic) >> 28]; | |||
/* We don't have to remove the 10th mantissa bit because it gets | |||
* added to our underestimated exponent. */ | |||
return s | (((125 - e) << 23) + (m << e)); | |||
} | |||
if (e == 0x7c00u) | |||
{ | |||
/* The amd64 pipeline likes the if() better than a ternary operator | |||
* or any other trick I could find. --sam */ | |||
if (m == 0) | |||
return s | 0x7f800000u; | |||
return s | 0x7fc00000u; | |||
} | |||
return s | (((e >> 10) + 112) << 23) | (m << 13); | |||
} | |||
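/* Worked example of the Inf/NaN path (illustration only, not in the original
 * source): x = 0xfc00 (-Inf) has e = 0x7c00 and m = 0, so the function
 * returns 0x80000000 | 0x7f800000 = 0xff800000, a float -Inf; a non-zero
 * mantissa such as x = 0x7e01 instead yields 0x7fc00000, a quiet NaN. */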
/* Constructor from float. Uses the non-branching version because benchmarks | |||
* indicate it is about 80% faster on amd64, and 20% faster on the PS3. The | |||
* penalty of loading the lookup tables appears to be negligible. */ | |||
half half::makefast(float f) | |||
{ | |||
union { float f; uint32_t x; } u = { f }; | |||
return makebits(float_to_half_nobranch(u.x)); | |||
} | |||
/* Constructor from float with better precision. */ | |||
half half::makeaccurate(float f) | |||
{ | |||
union { float f; uint32_t x; } u = { f }; | |||
return makebits(float_to_half_branch(u.x)); | |||
} | |||
/* Cast to float. Uses the branching version because loading the tables | |||
* for only one value is going to be cache-expensive. */ | |||
half::operator float() const | |||
{ | |||
union { float f; uint32_t x; } u; | |||
u.x = half_to_float_branch(bits); | |||
return u.f; | |||
} | |||
void half::convert(half *dst, float const *src, size_t nelem) | |||
{ | |||
for (size_t i = 0; i < nelem; i++) | |||
{ | |||
union { float f; uint32_t x; } u; | |||
u.f = *src++; | |||
*dst++ = makebits(float_to_half_nobranch(u.x)); | |||
} | |||
} | |||
void half::convert(float *dst, half const *src, size_t nelem) | |||
{ | |||
for (size_t i = 0; i < nelem; i++) | |||
{ | |||
union { float f; uint32_t x; } u; | |||
/* This code is really too slow on the PS3, even with the denormal | |||
* handling stripped off. */ | |||
u.x = half_to_float_nobranch((*src++).bits); | |||
*dst++ = u.f; | |||
} | |||
} | |||
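/* Usage sketch for the array converters (illustration only, not part of the
 * original file):
 *
 *     float src[3] = { 0.0f, 1.0f, -2.0f };
 *     half dst[3];
 *     half::convert(dst, src, 3);   // dst[1].bits is now 0x3c00 (1.0)
 *     float back[3];
 *     half::convert(back, dst, 3);  // round-trips exactly for these values
 */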
} /* namespace lol */ | |||