diff --git a/Makefile.am b/Makefile.am index 880ea64c..8616d245 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,3 +5,7 @@ DIST_SUBDIRS = $(SUBDIRS) EXTRA_DIST = bootstrap build-linux build-mingw AUTOMAKE_OPTIONS = dist-bzip2 +bench: + cd test && $(MAKE) $(AM_MAKEFLAGS) $@ +.PHONY: bench + diff --git a/src/half.cpp b/src/half.cpp index 36065678..dcbdf822 100644 --- a/src/half.cpp +++ b/src/half.cpp @@ -196,26 +196,34 @@ static inline uint32_t half_to_float_branch(uint16_t x) return s | (((e >> 10) + 112) << 23) | (m << 13); } +/* Constructor from float. Uses the non-branching version because benchmarks + * indicate it is always twice as fast. The penalty of loading the lookup + * tables does not seem important. */ half half::makefast(float f) { union { float f; uint32_t x; } u = { f }; return makebits(float_to_half_nobranch(u.x)); } +/* Constructor from float with better precision. */ half half::makeslow(float f) { union { float f; uint32_t x; } u = { f }; return makebits(float_to_half_branch(u.x)); } +/* Cast to float. Uses the branching version because loading the tables + * for only one value is going to be cache-expensive. */ half::operator float() const { + /* FIXME: there is a hidden "this" in this method. Export more + * code so that it can all work in registers instead. */ union { float f; uint32_t x; } u; u.x = half_to_float_branch(bits); return u.f; } -size_t half::copy(half *dst, float const *src, size_t nelem) +size_t half::convert(half *dst, float const *src, size_t nelem) { for (size_t i = 0; i < nelem; i++) { @@ -227,7 +235,7 @@ size_t half::copy(half *dst, float const *src, size_t nelem) return nelem; } -size_t half::copy(float *dst, half const *src, size_t nelem) +size_t half::convert(float *dst, half const *src, size_t nelem) { for (size_t i = 0; i < nelem; i++) { diff --git a/src/half.h b/src/half.h index 3105ec45..098714b0 100644 --- a/src/half.h +++ b/src/half.h @@ -25,12 +25,10 @@ namespace lol class half { public: + /* Constructors. Always inline so that the code can work in registers + * instead of calling routines with the hidden "this" parameter. */ inline half() { } - - inline half(float f) - { - *this = makefast(f); - } + inline half(float f) { *this = makefast(f); } inline int is_nan() const { @@ -57,8 +55,8 @@ public: inline operator int() const { return (int)(float)*this; } /* Array conversions */ - static size_t copy(half *dst, float const *src, size_t nelem); - static size_t copy(float *dst, half const *src, size_t nelem); + static size_t convert(half *dst, float const *src, size_t nelem); + static size_t convert(float *dst, half const *src, size_t nelem); /* Operations */ inline half operator -() { return makebits(bits ^ 0x8000u); } diff --git a/test/half.cpp b/test/half.cpp index 065102a7..d7c7e864 100644 --- a/test/half.cpp +++ b/test/half.cpp @@ -33,9 +33,9 @@ namespace lol class HalfTest : public CppUnit::TestCase { CPPUNIT_TEST_SUITE(HalfTest); - CPPUNIT_TEST(test_half_makebits); + CPPUNIT_TEST(test_half_from_float); CPPUNIT_TEST(test_half_makeslow); - CPPUNIT_TEST(test_half_makefast); + CPPUNIT_TEST(test_half_makebits); CPPUNIT_TEST(test_half_is_nan); CPPUNIT_TEST(test_half_is_inf); CPPUNIT_TEST(test_half_is_finite); @@ -56,12 +56,12 @@ public: void tearDown() {} - void test_half_makebits() + void test_half_from_float() { - for (unsigned int i = 0; i < 0x10000; i++) + for (size_t i = 0; i < sizeof(pairs) / sizeof(*pairs); i++) { - half a = half::makebits(i); - uint16_t b = i; + half a = (half)pairs[i].f; + uint16_t b = pairs[i].x; CPPUNIT_ASSERT_EQUAL(a.bits, b); } } @@ -76,12 +76,12 @@ public: } } - void test_half_makefast() + void test_half_makebits() { - for (size_t i = 0; i < sizeof(pairs) / sizeof(*pairs); i++) + for (unsigned int i = 0; i < 0x10000; i++) { - half a = half::makefast(pairs[i].f); - uint16_t b = pairs[i].x; + half a = half::makebits(i); + uint16_t b = i; CPPUNIT_ASSERT_EQUAL(a.bits, b); } } diff --git a/test/lol-bench.cpp b/test/lol-bench.cpp index d64cc77b..5c691207 100644 --- a/test/lol-bench.cpp +++ b/test/lol-bench.cpp @@ -12,48 +12,118 @@ # include "config.h" #endif +#include + #include "core.h" #include "loldebug.h" using namespace std; using namespace lol; +static size_t const HALF_TABLE_SIZE = 1024 * 1024; +static size_t const HALF_RUNS = 100; + +static void bench_half(int mode); + int main(int argc, char **argv) { + Log::Info("-----------------------------------\n"); + Log::Info("Half precision floats (random bits)\n"); + Log::Info("-----------------------------------\n"); + bench_half(1); + + Log::Info("---------------------------------\n"); + Log::Info("Half precision floats [-2.0, 2.0]\n"); + Log::Info("---------------------------------\n"); + bench_half(2); + + return EXIT_SUCCESS; +} + +static void bench_half(int mode) +{ + float result[8] = { 0.0f }; Timer timer; - float ftotal = 0.0f; - for (uint32_t i = 0; i < 0xffffffffu; i += 7) - { - union { float f; uint32_t x; } u; - u.x = i; + /* Set up tables */ + float *pf = new float[HALF_TABLE_SIZE]; + half *ph = new half[HALF_TABLE_SIZE]; - float h = (float)half::makefast(u.f); - ftotal += h; + switch (mode) + { + case 1: + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] = half::makebits(rand()); + break; + case 2: + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] = RandF(-2.0f, 2.0f); + break; } - Log::Info("time for makeslow: %f (hash %f)\n", timer.GetMs(), ftotal); - uint16_t total = 0; - for (uint32_t i = 0; i < 0xffffffffu; i += 7) + for (size_t run = 0; run < HALF_RUNS; run++) { - union { float f; uint32_t x; } u; - u.x = i; + /* Convert half to float (array) */ + timer.GetMs(); + half::convert(pf, ph, HALF_TABLE_SIZE); + result[1] += timer.GetMs(); - half h = half::makeslow(u.f); - total ^= h.bits; - } - Log::Info("time for makeslow: %f (hash %04x)\n", timer.GetMs(), total); + /* Convert half to float (fast) */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + pf[i] = (float)ph[i]; + result[0] += timer.GetMs(); - for (uint32_t i = 0; i < 0xffffffffu; i += 7) - { - union { float f; uint32_t x; } u; - u.x = i; + /* Convert float to half (array) */ + timer.GetMs(); + half::convert(ph, pf, HALF_TABLE_SIZE); + result[4] += timer.GetMs(); + + /* Convert float to half (fast) */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] = (half)pf[i]; + result[2] += timer.GetMs(); + + /* Convert float to half (slow) */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] = half::makeslow(pf[i]); + result[3] += timer.GetMs(); - half h = half::makefast(u.f); - total ^= h.bits; + /* Change sign of every half */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] = -ph[i]; + result[5] += timer.GetMs(); + + /* Add a half to every float */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + pf[i] += ph[i]; + result[6] += timer.GetMs(); + + /* Add a float to every half */ + timer.GetMs(); + for (size_t i = 0; i < HALF_TABLE_SIZE; i++) + ph[i] += pf[i]; + result[7] += timer.GetMs(); } - Log::Info("time for makefast: %f (hash %04x)\n", timer.GetMs(), total); - return EXIT_SUCCESS; + delete[]pf; + delete[]ph; + + for (size_t i = 0; i < sizeof(result) / sizeof(*result); i++) + result[i] *= 1000000.0f / (HALF_TABLE_SIZE * HALF_RUNS); + + Log::Info(" ns/elem\n"); + Log::Info("float = half %7.3f\n", result[0]); + Log::Info("float[] = half[] %7.3f\n", result[1]); + Log::Info("half = float %7.3f\n", result[2]); + Log::Info("half = makeslow(float) %7.3f\n", result[3]); + Log::Info("half[] = float[] %7.3f\n", result[4]); + Log::Info("half = -half %7.3f\n", result[5]); + Log::Info("float += half %7.3f\n", result[6]); + Log::Info("half += float %7.3f\n", result[7]); }