
core: reactivate half denormals for the PS3.

We know we will not have denormal floats on the PS3, but we should still
create denormal halves in case the other end (maybe the GPU?) knows how
to handle them.
legacy
Sam Hocevar (sam) committed 13 years ago
commit 4b9bd58747
2 files changed, 69 insertions(+), 31 deletions(-)
  1. src/half.cpp  +65 −29
  2. src/half.h    +4 −2
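For context on the commit message above: a denormal (subnormal) half is any magnitude below 2^-14; its 5-bit exponent field is zero and its value is simply mantissa × 2^-24. A minimal standalone decoding sketch, independent of this repository's API:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main(void)
    {
        /* The smallest normal half is 2^-14; anything smaller is stored as
         * a denormal whose value is just mantissa * 2^-24. */
        uint16_t h = 0x0200; /* exponent field 0, mantissa 512 */
        double v = std::ldexp((double)(h & 0x3ff), -24);
        std::printf("0x%04x decodes to %g (= 2^-15)\n", h, v);
        return 0;
    }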

src/half.cpp  +65 −29

@@ -12,6 +12,10 @@
 # include "config.h"
 #endif
 
+#if defined __CELLOS_LV2__
+# include <ppu_altivec_internals.h>
+#endif
+
 #include "core.h"
 
 using namespace std;
@@ -37,13 +41,13 @@ static inline uint16_t float_to_half_nobranch(uint32_t x)
 {
     static uint16_t const basetable[512] =
     {
-#define S1(i) (((i) < 103) ? 0x0000: \
+#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
         S256(0),
 #undef S1
 #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
-              ((i) < 113) ? 0x0400 >> (113 - (i)): \
+              ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
         S256(0),
 #undef S1
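The table above maps the float's 8-bit exponent (plus the sign bit in the upper 256 entries) directly to the half's sign/exponent bits: exponents below 103 flush to signed zero, 103–112 produce denormals, 113–142 produce normals, and anything larger becomes infinity. A quick standalone sanity check of that selection logic (a sketch, not code from this repository):

    #include <cassert>
    #include <cstdint>

    /* Same selection logic as the S1 macro above, as a plain function. */
    static uint16_t base_for_exponent(int i)
    {
        return (i < 103) ? 0x0000              /* flush to zero */
             : (i < 113) ? 0x0400 >> (113 - i) /* denormal half */
             : (i < 143) ? (i - 112) << 10     /* normal half   */
             : 0x7c00;                         /* infinity      */
    }

    int main(void)
    {
        assert(base_for_exponent(127) == 0x3c00); /* 1.0f  -> 0x3c00   */
        assert(base_for_exponent(112) == 0x0200); /* 2^-15 -> denormal */
        assert(base_for_exponent(150) == 0x7c00); /* 2^23  -> +inf     */
        return 0;
    }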
@@ -72,16 +76,10 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
     unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
 
-    /* If zero, or denormal, or exponent underflows too much for a denormal,
-     * return signed zero. */
-#if !defined __CELLOS_LV2__
+    /* If zero, or denormal, or exponent underflows too much for a denormal
+     * half, return signed zero. */
     if (e < 103)
         return bits;
-#else
-    /* PS3 don't know bout my denormals */
-    if (e < 113)
-        return bits;
-#endif
 
     /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
     if (e > 142)
@@ -93,7 +91,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         return bits;
     }
 
-#if !defined __CELLOS_LV2__
     /* If exponent underflows but not too much, return a denormal */
     if (e < 113)
     {
@@ -103,7 +100,6 @@ static inline uint16_t float_to_half_branch(uint32_t x)
         bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
         return bits;
     }
-#endif
 
     bits |= ((e - 112) << 10) | (m >> 1);
     /* Extra rounding. An overflow will set mantissa to 0 and increment
@@ -112,6 +108,53 @@ static inline uint16_t float_to_half_branch(uint32_t x)
     return bits;
 }
 
+#if 0
+static inline void float_to_half_vector(half *dst, float const *src)
+{
+    vector unsigned int const v7 = vec_splat_u32(7);
+    vector unsigned short const v6 = vec_splat_u16(6);
+#if _XBOX
+    vector signed short const v9 = vec_splat_u16(9);
+    vector unsigned short const v10 = vec_splat_u16(10);
+#else
+    vector signed short const v0x0040 = {
+        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
+    vector unsigned short const v0x0400 = {
+        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
+#endif
+    vector unsigned char const shuffle_high = {
+        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+    vector unsigned char const shuffle_low = {
+        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+    vector unsigned char const v0xbf70 = {
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
+        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};
+
+    vector unsigned short v_mant, v_ret;
+    vector signed short v_exp;
+    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
+    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);
+
+    in0 = vec_sra(in0, v7);
+    in1 = vec_sra(in1, v7);
+    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
+    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
+    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
+#if _XBOX
+    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
+#else
+    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
+#endif
+    v_mant = vec_sr(v_mant, v6);
+#if _XBOX
+    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
+#else
+    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
+#endif
+    vec_st(v_ret, 0, (uint16_t *)dst);
+}
+#endif
+
 static int const shifttable[32] =
 {
     23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
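If the #if 0 block above is ever enabled, note that float_to_half_vector converts eight floats per call: two 16-byte vec_ld loads in, one 16-byte vec_st of eight halves out, so both pointers must be 16-byte aligned. A hypothetical driver loop (a sketch; the alignment and size requirements are assumptions read off the loads and stores above):

    /* Assumes dst and src are 16-byte aligned and nelem is a multiple of 8,
     * since each call consumes 8 floats and produces 8 halves. */
    static void convert_aligned(half *dst, float const *src, size_t nelem)
    {
        for (size_t i = 0; i < nelem; i += 8)
            float_to_half_vector(dst + i, src + i);
    }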
@@ -210,18 +253,12 @@ static inline uint32_t half_to_float_branch(uint16_t x)
 }
 
 /* Constructor from float. Uses the non-branching version because benchmarks
- * indicate it is always twice as fast. The penalty of loading the lookup
- * tables does not seem important. */
+ * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
+ * penalty of loading the lookup tables does not seem important. */
 half half::makefast(float f)
 {
     union { float f; uint32_t x; } u = { f };
-#if !defined __CELLOS_LV2__
     return makebits(float_to_half_nobranch(u.x));
-#else
-    /* This code is slightly faster on the PS3, mostly because we
-     * don't need to care about denormals. */
-    return makebits(float_to_half_branch(u.x));
-#endif
 }
 
 /* Constructor from float with better precision. */
@@ -233,12 +270,10 @@ half half::makeaccurate(float f)
 
 /* Cast to float. Uses the branching version because loading the tables
  * for only one value is going to be cache-expensive. */
-half::operator float() const
+float half::tofloat(half h)
 {
-    /* FIXME: there is a hidden "this" in this method. Export more
-     * code so that it can all work in registers instead. */
     union { float f; uint32_t x; } u;
-    u.x = half_to_float_branch(bits);
+    u.x = half_to_float_branch(h.bits);
     return u.f;
 }
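Making the conversion a static method resolves the old FIXME: there is no hidden this pointer, so everything can stay in registers, and the implicit cast in half.h (below) simply forwards to it. A usage sketch; note the round trip loses precision, since binary16 only has an 11-bit significand:

    half h = half::makefast(3.141593f);
    float a = half::tofloat(h); /* explicit: ~3.1406, the nearest half */
    float b = h;                /* implicit, via operator float() below */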

@@ -248,12 +283,13 @@ size_t half::convert(half *dst, float const *src, size_t nelem)
     {
         union { float f; uint32_t x; } u;
         u.f = *src++;
-#if !defined __CELLOS_LV2__
         *dst++ = makebits(float_to_half_nobranch(u.x));
-#else
-        /* This code is slightly faster on the PS3, mostly because we
-         * don't need to care about denormals. */
-        *dst++ = makebits(float_to_half_branch(u.x));
+#if 0
+        /* AltiVec code. Will work one day. */
+        float_to_half_vector(dst, src);
+        src += 8;
+        dst += 8;
+        i += 7;
 #endif
     }
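A usage sketch for the scalar path that remains active (values chosen to be exactly representable in binary16):

    float src[4] = { 0.0f, 1.0f, 2.5f, 65504.0f }; /* 65504 = max finite half */
    half dst[4];
    half::convert(dst, src, 4);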



src/half.h  +4 −2

@@ -51,8 +51,10 @@ public:
     }
 
     /* Cast to other types */
-    operator float() const;
-    inline operator int() const { return (int)(float)*this; }
+    inline operator float() const { return tofloat(*this); }
+    inline operator int() const { return (int)tofloat(*this); }
+
+    static float tofloat(half h);
 
     /* Array conversions */
     static size_t convert(half *dst, float const *src, size_t nelem);
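With these inline operators a half behaves like a narrow float in expressions, and the int cast truncates toward zero like any float-to-int cast. For instance (a sketch):

    half h = half::makefast(2.75f);
    float f = h;    /* 2.75f */
    int i = (int)h; /* 2: tofloat(), then C-style truncation */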

