Browse Source

core: tune the half precision code so that the best variants are being used on the PS3 platform.
legacy
Sam Hocevar sam 13 years ago
parent
commit
cb001ddbcc
4 changed files with 52 additions and 21 deletions
  1. +32
    -1
      src/half.cpp
  2. +1
    -1
      src/half.h
  3. +3
    -3
      test/half.cpp
  4. +16
    -16
      test/lol-bench.cpp

+ 32
- 1
src/half.cpp View File

@@ -74,8 +74,14 @@ static inline uint16_t float_to_half_branch(uint32_t x)

/* If zero, or denormal, or exponent underflows too much for a denormal,
* return signed zero. */
#if !defined __CELLOS_LV2__
if (e < 103)
return bits;
#else
/* The PS3 build does not handle denormals: return signed zero for them too. */
if (e < 113)
return bits;
#endif

/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
if (e > 142)
@@ -87,6 +93,7 @@ static inline uint16_t float_to_half_branch(uint32_t x)
return bits;
}

#if !defined __CELLOS_LV2__
/* If exponent underflows but not too much, return a denormal */
if (e < 113)
{
@@ -96,6 +103,7 @@ static inline uint16_t float_to_half_branch(uint32_t x)
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
return bits;
}
#endif

bits |= ((e - 112) << 10) | (m >> 1);
/* Extra rounding. An overflow will set mantissa to 0 and increment
@@ -172,6 +180,7 @@ static inline uint32_t half_to_float_branch(uint16_t x)

if (e == 0)
{
#if !defined __CELLOS_LV2__
uint32_t v = m | (m >> 1);
v |= v >> 2;
v |= v >> 4;
@@ -182,6 +191,10 @@ static inline uint32_t half_to_float_branch(uint16_t x)
/* We don't have to remove the 10th mantissa bit because it gets
* added to our underestimated exponent. */
return s | (((125 - e) << 23) + (m << e));
#else
/* The PS3 build does not handle denormals: return signed zero. */
return s;
#endif
}

if (e == 0x7c00u)
@@ -202,11 +215,17 @@ static inline uint32_t half_to_float_branch(uint16_t x)
/* Fast constructor from float: reinterprets the bits of f through a
* union, passes them to a float-to-half bit conversion routine and wraps
* the resulting bits with makebits(). The conversion routine is chosen
* at compile time depending on whether __CELLOS_LV2__ is defined. */
half half::makefast(float f)
{
union { float f; uint32_t x; } u = { f };
#if !defined __CELLOS_LV2__
/* Default case: use the nobranch conversion. */
return makebits(float_to_half_nobranch(u.x));
#else
/* This code is slightly faster on the PS3, mostly because we
* don't need to care about denormals. */
return makebits(float_to_half_branch(u.x));
#endif
}

/* Constructor from float with better precision. */
half half::makeslow(float f)
half half::makeaccurate(float f)
{
union { float f; uint32_t x; } u = { f };
return makebits(float_to_half_branch(u.x));
@@ -229,7 +248,13 @@ size_t half::convert(half *dst, float const *src, size_t nelem)
{
union { float f; uint32_t x; } u;
u.f = *src++;
#if !defined __CELLOS_LV2__
*dst++ = makebits(float_to_half_nobranch(u.x));
#else
/* This code is slightly faster on the PS3, mostly because we
* don't need to care about denormals. */
*dst++ = makebits(float_to_half_branch(u.x));
#endif
}

return nelem;
@@ -240,7 +265,13 @@ size_t half::convert(float *dst, half const *src, size_t nelem)
for (size_t i = 0; i < nelem; i++)
{
union { float f; uint32_t x; } u;
#if !defined __CELLOS_LV2__
/* The nobranch variant is really too slow on the PS3, even with the
* denormal handling stripped off, so it is only used on other platforms. */
u.x = half_to_float_nobranch((*src++).bits);
#else
u.x = half_to_float_branch((*src++).bits);
#endif
*dst++ = u.f;
}



+ 1
- 1
src/half.h View File

@@ -79,8 +79,8 @@ public:
inline float operator /(half h) const { return (float)*this / (float)h; }

/* Factories */
static half makeslow(float f);
static half makefast(float f);
static half makeaccurate(float f);
static inline half makebits(uint16_t x)
{
half ret;


+ 3
- 3
test/half.cpp View File

@@ -34,7 +34,7 @@ class HalfTest : public CppUnit::TestCase
{
CPPUNIT_TEST_SUITE(HalfTest);
CPPUNIT_TEST(test_half_from_float);
CPPUNIT_TEST(test_half_makeslow);
CPPUNIT_TEST(test_half_makeaccurate);
CPPUNIT_TEST(test_half_makebits);
CPPUNIT_TEST(test_half_is_nan);
CPPUNIT_TEST(test_half_is_inf);
@@ -66,11 +66,11 @@ public:
}
}

void test_half_makeslow()
void test_half_makeaccurate()
{
for (size_t i = 0; i < sizeof(pairs) / sizeof(*pairs); i++)
{
half a = half::makeslow(pairs[i].f);
half a = half::makeaccurate(pairs[i].f);
uint16_t b = pairs[i].x;
CPPUNIT_ASSERT_EQUAL(a.bits, b);
}


+ 16
- 16
test/lol-bench.cpp View File

@@ -66,30 +66,30 @@ static void bench_half(int mode)
/* Convert half to float (array) */
timer.GetMs();
half::convert(pf, ph, HALF_TABLE_SIZE);
result[1] += timer.GetMs();
result[0] += timer.GetMs();

/* Convert half to float (fast) */
timer.GetMs();
for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
pf[i] = (float)ph[i];
result[0] += timer.GetMs();
result[1] += timer.GetMs();

/* Convert float to half (array) */
timer.GetMs();
half::convert(ph, pf, HALF_TABLE_SIZE);
result[4] += timer.GetMs();
result[2] += timer.GetMs();

/* Convert float to half (fast) */
timer.GetMs();
for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
ph[i] = (half)pf[i];
result[2] += timer.GetMs();
result[3] += timer.GetMs();

/* Convert float to half (slow) */
/* Convert float to half (accurate) */
timer.GetMs();
for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
ph[i] = half::makeslow(pf[i]);
result[3] += timer.GetMs();
ph[i] = half::makeaccurate(pf[i]);
result[4] += timer.GetMs();

/* Change sign of every half */
timer.GetMs();
@@ -116,14 +116,14 @@ static void bench_half(int mode)
for (size_t i = 0; i < sizeof(result) / sizeof(*result); i++)
result[i] *= 1000000.0f / (HALF_TABLE_SIZE * HALF_RUNS);

Log::Info(" ns/elem\n");
Log::Info("float = half %7.3f\n", result[0]);
Log::Info("float[] = half[] %7.3f\n", result[1]);
Log::Info("half = float %7.3f\n", result[2]);
Log::Info("half = makeslow(float) %7.3f\n", result[3]);
Log::Info("half[] = float[] %7.3f\n", result[4]);
Log::Info("half = -half %7.3f\n", result[5]);
Log::Info("float += half %7.3f\n", result[6]);
Log::Info("half += float %7.3f\n", result[7]);
Log::Info(" ns/elem\n");
Log::Info("float[] = half[] %7.3f\n", result[0]);
Log::Info("float = half %7.3f\n", result[1]);
Log::Info("half[] = float[] %7.3f\n", result[2]);
Log::Info("half = float (fast) %7.3f\n", result[3]);
Log::Info("half = float (accurate) %7.3f\n", result[4]);
Log::Info("half = -half %7.3f\n", result[5]);
Log::Info("float += half %7.3f\n", result[6]);
Log::Info("half += float %7.3f\n", result[7]);
}


Loading…
Cancel
Save