From 4b9bd5874733c2fbf8b171e1376c7450b17cfb3a Mon Sep 17 00:00:00 2001
From: Sam Hocevar <sam@hocevar.net>
Date: Tue, 30 Aug 2011 17:19:08 +0000
Subject: [PATCH] core: reactivate half denormals for the PS3.

We know we will not have denormal floats on the PS3, but we should still
create denormal halves in case the other end (maybe the GPU?) knows how
to handle them.
---
 src/half.cpp | 94 ++++++++++++++++++++++++++++++++++++----------------
 src/half.h   |  6 ++--
 2 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/src/half.cpp b/src/half.cpp
index 5e99dd16..0f2109db 100644
--- a/src/half.cpp
+++ b/src/half.cpp
@@ -12,6 +12,10 @@
 #   include "config.h"
 #endif
 
+#if defined __CELLOS_LV2__
+#   include <ppu_altivec_internals.h>
+#endif
+
 #include "core.h"
 
 using namespace std;
@@ -37,13 +41,13 @@ static inline uint16_t float_to_half_nobranch(uint32_t x)
 {
     static uint16_t const basetable[512] =
     {
-#define S1(i) (((i) < 103) ? 0x0000: \
+#define S1(i) (((i) < 103) ? 0x0000 : \
                ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
         S256(0),
 #undef S1
 #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
-                         ((i) < 113) ? 0x0400 >> (113 - (i)): \
+                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                          ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
         S256(0),
 #undef S1
@@ -72,16 +76,10 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
     unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
 
-    /* If zero, or denormal, or exponent underflows too much for a denormal,
-     * return signed zero. */
-#if !defined __CELLOS_LV2__
+    /* If the input is zero or denormal, or if its exponent underflows too
+     * much even for a denormal half, return signed zero. */
     if (e < 103)
         return bits;
-#else
-    /* PS3 don't know bout my denormals */
-    if (e < 113)
-        return bits;
-#endif
 
     /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
     if (e > 142)
@@ -93,7 +91,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         return bits;
     }
 
-#if !defined __CELLOS_LV2__
     /* If exponent underflows but not too much, return a denormal */
     if (e < 113)
     {
@@ -103,7 +100,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
         return bits;
     }
-#endif
 
     bits |= ((e - 112) << 10) | (m >> 1);
     /* Extra rounding. An overflow will set mantissa to 0 and increment
@@ -112,6 +108,53 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     return bits;
 }
 
+#if 0
+static inline void float_to_half_vector(half *dst, float const *src)
+{
+    vector unsigned int const v7 = vec_splat_u32(7);
+    vector unsigned short const v6 = vec_splat_u16(6);
+#if _XBOX
+    vector signed short const v9 = vec_splat_u16(9);
+    vector unsigned short const v10 = vec_splat_u16(10);
+#else
+    vector signed short const v0x0040 = {
+        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
+    vector unsigned short const v0x0400 = {
+        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
+#endif
+    vector unsigned char const shuffle_high = {
+        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+    vector unsigned char const shuffle_low = {
+        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+    vector unsigned char const v0xbf70 = {
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};
+
+    vector unsigned short v_mant, v_ret;
+    vector signed short v_exp;
+    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
+    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);
+
+    in0 = vec_sra(in0, v7);
+    in1 = vec_sra(in1, v7);
+    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
+    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
+    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
+#if _XBOX
+    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
+#else
+    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
+#endif
+    v_mant = vec_sr(v_mant, v6);
+#if _XBOX
+    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
+#else
+    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
+#endif
+    vec_st(v_ret, 0, (uint16_t *)dst);
+}
+#endif
+
 static int const shifttable[32] =
 {
     23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
@@ -210,18 +253,12 @@ static inline uint32_t half_to_float_branch(uint16_t x)
 }
 
 /* Constructor from float. Uses the non-branching version because benchmarks
- * indicate it is always twice as fast. The penalty of loading the lookup
- * tables does not seem important. */
+ * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
+ * penalty of loading the lookup tables does not seem important. */
 half half::makefast(float f)
 {
     union { float f; uint32_t x; } u = { f };
-#if !defined __CELLOS_LV2__
     return makebits(float_to_half_nobranch(u.x));
-#else
-    /* This code is slightly faster on the PS3, mostly because we
-     * don't need to care about denormals. */
-    return makebits(float_to_half_branch(u.x));
-#endif
 }
 
 /* Constructor from float with better precision. */
@@ -233,12 +270,10 @@ half half::makeaccurate(float f)
 
 /* Cast to float. Uses the branching version because loading the tables
  * for only one value is going to be cache-expensive. */
-half::operator float() const
+float half::tofloat(half h)
 {
-    /* FIXME: there is a hidden "this" in this method. Export more
-     * code so that it can all work in registers instead. */
     union { float f; uint32_t x; } u;
-    u.x = half_to_float_branch(bits);
+    u.x = half_to_float_branch(h.bits);
     return u.f;
 }
 
@@ -248,12 +283,13 @@ size_t half::convert(half *dst, float const *src, size_t nelem)
     {
         union { float f; uint32_t x; } u;
         u.f = *src++;
-#if !defined __CELLOS_LV2__
         *dst++ = makebits(float_to_half_nobranch(u.x));
-#else
-        /* This code is slightly faster on the PS3, mostly because we
-         * don't need to care about denormals. */
-        *dst++ = makebits(float_to_half_branch(u.x));
+#if 0
+        /* AltiVec path (8 values per call). Disabled until it works. */
+        float_to_half_vector(dst, src);
+        src += 8;
+        dst += 8;
+        i += 7;
 #endif
     }
 
diff --git a/src/half.h b/src/half.h
index 2377845c..10f9ec54 100644
--- a/src/half.h
+++ b/src/half.h
@@ -51,8 +51,10 @@ public:
     }
 
     /* Cast to other types */
-    operator float() const;
-    inline operator int() const { return (int)(float)*this; }
+    inline operator float() const { return tofloat(*this); }
+    inline operator int() const { return (int)tofloat(*this); }
+
+    static float tofloat(half h);
 
     /* Array conversions */
     static size_t convert(half *dst, float const *src, size_t nelem);