diff --git a/src/half.cpp b/src/half.cpp
index 5e99dd16..0f2109db 100644
--- a/src/half.cpp
+++ b/src/half.cpp
@@ -12,6 +12,10 @@
 #   include "config.h"
 #endif
 
+#if defined __CELLOS_LV2__
+#   include <altivec.h>
+#endif
+
 #include "core.h"
 
 using namespace std;
@@ -37,13 +41,13 @@ static inline uint16_t float_to_half_nobranch(uint32_t x)
 {
     static uint16_t const basetable[512] =
     {
-#define S1(i) (((i) < 103) ? 0x0000: \
+#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
         S256(0),
 #undef S1
 #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
-                         ((i) < 113) ? 0x0400 >> (113 - (i)): \
+                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                          ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
         S256(0),
 #undef S1
@@ -72,16 +76,10 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
     unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
 
-    /* If zero, or denormal, or exponent underflows too much for a denormal,
-     * return signed zero. */
-#if !defined __CELLOS_LV2__
+    /* If zero, or denormal, or exponent underflows too much for a denormal
+     * half, return signed zero. */
     if (e < 103)
         return bits;
-#else
-    /* PS3 don't know bout my denormals */
-    if (e < 113)
-        return bits;
-#endif
 
     /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
     if (e > 142)
@@ -93,7 +91,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         return bits;
     }
 
-#if !defined __CELLOS_LV2__
     /* If exponent underflows but not too much, return a denormal */
     if (e < 113)
     {
@@ -103,7 +100,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
         return bits;
     }
-#endif
 
     bits |= ((e - 112) << 10) | (m >> 1);
     /* Extra rounding. An overflow will set mantissa to 0 and increment
@@ -112,6 +108,53 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     return bits;
 }
 
+#if 0
+static inline void float_to_half_vector(half *dst, float const *src)
+{
+    vector unsigned int const v7 = vec_splat_u32(7);
+    vector unsigned short const v6 = vec_splat_u16(6);
+#if _XBOX
+    vector signed short const v9 = vec_splat_u16(9);
+    vector unsigned short const v10 = vec_splat_u16(10);
+#else
+    vector signed short const v0x0040 = {
+        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
+    vector unsigned short const v0x0400 = {
+        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
+#endif
+    vector unsigned char const shuffle_high = {
+        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+    vector unsigned char const shuffle_low = {
+        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+    vector unsigned char const v0xbf70 = {
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};
+
+    vector unsigned short v_mant, v_ret;
+    vector signed short v_exp;
+    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
+    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);
+
+    in0 = vec_sra(in0, v7);
+    in1 = vec_sra(in1, v7);
+    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
+    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
+    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
+#if _XBOX
+    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
+#else
+    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
+#endif
+    v_mant = vec_sr(v_mant, v6);
+#if _XBOX
+    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
+#else
+    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
+#endif
+    vec_st(v_ret, 0, (uint16_t *)dst);
+}
+#endif
+
 static int const shifttable[32] =
 {
     23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
@@ -210,18 +253,12 @@ static inline uint32_t half_to_float_branch(uint16_t x)
 }
 
 /* Constructor from float. Uses the non-branching version because benchmarks
- * indicate it is always twice as fast. The penalty of loading the lookup
- * tables does not seem important. */
+ * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
+ * penalty of loading the lookup tables does not seem important. */
 half half::makefast(float f)
 {
     union { float f; uint32_t x; } u = { f };
-#if !defined __CELLOS_LV2__
     return makebits(float_to_half_nobranch(u.x));
-#else
-    /* This code is slightly faster on the PS3, mostly because we
-     * don't need to care about denormals. */
-    return makebits(float_to_half_branch(u.x));
-#endif
 }
 
 /* Constructor from float with better precision. */
@@ -233,12 +270,10 @@ half half::makeaccurate(float f)
 
 /* Cast to float. Uses the branching version because loading the tables
  * for only one value is going to be cache-expensive. */
-half::operator float() const
+float half::tofloat(half h)
 {
-    /* FIXME: there is a hidden "this" in this method. Export more
-     * code so that it can all work in registers instead. */
     union { float f; uint32_t x; } u;
-    u.x = half_to_float_branch(bits);
+    u.x = half_to_float_branch(h.bits);
     return u.f;
 }
 
@@ -248,12 +283,13 @@ size_t half::convert(half *dst, float const *src, size_t nelem)
     {
         union { float f; uint32_t x; } u;
         u.f = *src++;
-#if !defined __CELLOS_LV2__
         *dst++ = makebits(float_to_half_nobranch(u.x));
-#else
-        /* This code is slightly faster on the PS3, mostly because we
-         * don't need to care about denormals. */
-        *dst++ = makebits(float_to_half_branch(u.x));
+#if 0
+        /* AltiVec code. Will work one day. */
+        float_to_half_vector(dst, src);
+        src += 8;
+        dst += 8;
+        i += 7;
 #endif
     }
 
diff --git a/src/half.h b/src/half.h
index 2377845c..10f9ec54 100644
--- a/src/half.h
+++ b/src/half.h
@@ -51,8 +51,10 @@ public:
     }
 
     /* Cast to other types */
-    operator float() const;
-    inline operator int() const { return (int)(float)*this; }
+    inline operator float() const { return tofloat(*this); }
+    inline operator int() const { return (int)tofloat(*this); }
+
+    static float tofloat(half h);
 
     /* Array conversions */
     static size_t convert(half *dst, float const *src, size_t nelem);
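Note on the basetable hunk: the only change there is cosmetic (spacing around ':' in the S1 macro), but the macro itself is dense. As a reading aid, here is a hypothetical runtime equivalent of one basetable entry; base_entry() is not in the repository, and the layout (positive floats in the first 256 slots, sign-bit-set copies in the next 256) is inferred from the two S1 definitions above:

#include <cstdint>
#include <cstdio>

/* Runtime sketch of one compile-time S1/S256 basetable entry, indexed by
 * the 8-bit float exponent i, with the half sign bit passed separately. */
static uint16_t base_entry(unsigned int i, uint16_t sign)
{
    uint16_t v = (i < 103) ? 0x0000               /* flush to signed zero */
               : (i < 113) ? 0x0400 >> (113 - i)  /* denormal half */
               : (i < 143) ? (i - 112) << 10      /* normal half exponent */
               : 0x7c00;                          /* overflow to Inf */
    return sign | v;
}

int main(void)
{
    printf("%04x\n", base_entry(127, 0x0000)); /* exponent of 1.0f -> 3c00 */
    printf("%04x\n", base_entry(128, 0x8000)); /* exponent of -2.0f -> c000 */
    return 0;
}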
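The float_to_half_branch() hunks delete the PS3 special case, so denormal halves are now produced on every platform. The standalone sanity check below is a sketch only: it stitches the function back together from the hunks above, and the Inf/NaN body (elided between hunks) is filled in from its comment, so treat that part as an assumption rather than the repository's exact code.

#include <cassert>
#include <cstdint>

static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t bits = (x >> 16) & 0x8000; /* Grab the sign bit */
    uint16_t m = (x >> 12) & 0x07ff;    /* Keep one extra bit for rounding */
    unsigned int e = (x >> 23) & 0xff;

    /* Zero, or too small even for a denormal half: signed zero */
    if (e < 103)
        return bits;

    /* NaN, Inf or exponent overflow (reconstructed from the comment):
     * return Inf, keeping one mantissa bit so that NaN stays NaN */
    if (e > 142)
    {
        bits |= 0x7c00;
        bits |= (e == 255) && (x & 0x007fffff);
        return bits;
    }

    /* Exponent underflows, but not too much: denormal half */
    if (e < 113)
    {
        m |= 0x0800; /* Make the implicit leading bit explicit */
        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
        return bits;
    }

    bits |= ((e - 112) << 10) | (m >> 1);
    bits += m & 1; /* Extra rounding; an overflow bumps the exponent, fine */
    return bits;
}

int main(void)
{
    assert(float_to_half_branch(0x3f800000) == 0x3c00); /* 1.0f */
    assert(float_to_half_branch(0xc0000000) == 0xc000); /* -2.0f */
    assert(float_to_half_branch(0x80000000) == 0x8000); /* -0.0f */
    assert(float_to_half_branch(0x7f800000) == 0x7c00); /* +Inf */
    assert(float_to_half_branch(0x7fc00000) == 0x7c01); /* NaN stays NaN */
    assert(float_to_half_branch(0x35800000) == 0x0010); /* 2^-20, denormal */
    return 0;
}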
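On the half.h side, operator float() is now an inline forward to the new static tofloat(), which removes the hidden "this" pointer that the deleted FIXME complained about. A hypothetical usage sketch, assuming only the public half API visible in this diff (makefast(), convert(), tofloat(), the two cast operators) and that half is default-constructible:

#include <cstdio>
#include "half.h"

int main(void)
{
    half h = half::makefast(3.14159f); /* table-driven float -> half */

    float f = (float)h; /* inline, forwards to half::tofloat(h) */
    int i = (int)h;     /* (int)tofloat(h) */

    /* Bulk conversion through half::convert() */
    float src[4] = { 0.0f, 1.0f, -2.5f, 65504.0f };
    half dst[4];
    half::convert(dst, src, 4);

    printf("%f %d %f\n", f, i, (float)dst[3]);
    return 0;
}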