diff --git a/Makefile.am b/Makefile.am
index 880ea64c..8616d245 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,3 +5,7 @@ DIST_SUBDIRS = $(SUBDIRS)
 EXTRA_DIST = bootstrap build-linux build-mingw
 AUTOMAKE_OPTIONS = dist-bzip2
 
+bench:
+	cd test && $(MAKE) $(AM_MAKEFLAGS) $@
+.PHONY: bench
+
diff --git a/src/half.cpp b/src/half.cpp
index 36065678..dcbdf822 100644
--- a/src/half.cpp
+++ b/src/half.cpp
@@ -196,26 +196,34 @@ static inline uint32_t half_to_float_branch(uint16_t x)
     return s | (((e >> 10) + 112) << 23) | (m << 13);
 }
 
+/* Constructor from float. Uses the non-branching version because
+ * benchmarks indicate it is always twice as fast as the branching one.
+ * The penalty of loading its lookup tables does not seem important. */
 half half::makefast(float f)
 {
     union { float f; uint32_t x; } u = { f };
     return makebits(float_to_half_nobranch(u.x));
 }
 
+/* Constructor from float with better precision (branching version). */
 half half::makeslow(float f)
 {
     union { float f; uint32_t x; } u = { f };
     return makebits(float_to_half_branch(u.x));
 }
 
+/* Cast to float. Uses the branching version because loading the lookup
+ * tables to convert only one value is going to be cache-expensive. */
 half::operator float() const
 {
+    /* FIXME: this method is not inline, so it takes a hidden "this"
+     * pointer. Export more code so it can all work in registers instead. */
     union { float f; uint32_t x; } u;
     u.x = half_to_float_branch(bits);
     return u.f;
 }
 
-size_t half::copy(half *dst, float const *src, size_t nelem)
+size_t half::convert(half *dst, float const *src, size_t nelem)
 {
     for (size_t i = 0; i < nelem; i++)
     {
@@ -227,7 +235,7 @@ size_t half::copy(half *dst, float const *src, size_t nelem)
     return nelem;
 }
 
-size_t half::copy(float *dst, half const *src, size_t nelem)
+size_t half::convert(float *dst, half const *src, size_t nelem)
 {
     for (size_t i = 0; i < nelem; i++)
     {
diff --git a/src/half.h b/src/half.h
index 3105ec45..098714b0 100644
--- a/src/half.h
+++ b/src/half.h
@@ -25,12 +25,10 @@ namespace lol
 class half
 {
 public:
+    /* Constructors. Always inline so that the code can work in registers
+     * instead of calling routines with the hidden "this" parameter. */
     inline half() { }
-
-    inline half(float f)
-    {
-        *this = makefast(f);
-    }
+    inline half(float f) { *this = makefast(f); }
 
     inline int is_nan() const
     {
@@ -57,8 +55,8 @@ public:
     inline operator int() const { return (int)(float)*this; }
 
     /* Array conversions */
-    static size_t copy(half *dst, float const *src, size_t nelem);
-    static size_t copy(float *dst, half const *src, size_t nelem);
+    static size_t convert(half *dst, float const *src, size_t nelem);
+    static size_t convert(float *dst, half const *src, size_t nelem);
 
     /* Operations */
     inline half operator -() { return makebits(bits ^ 0x8000u); }
diff --git a/test/half.cpp b/test/half.cpp
index 065102a7..d7c7e864 100644
--- a/test/half.cpp
+++ b/test/half.cpp
@@ -33,9 +33,9 @@ namespace lol
 class HalfTest : public CppUnit::TestCase
 {
     CPPUNIT_TEST_SUITE(HalfTest);
-    CPPUNIT_TEST(test_half_makebits);
+    CPPUNIT_TEST(test_half_from_float);
     CPPUNIT_TEST(test_half_makeslow);
-    CPPUNIT_TEST(test_half_makefast);
+    CPPUNIT_TEST(test_half_makebits);
     CPPUNIT_TEST(test_half_is_nan);
     CPPUNIT_TEST(test_half_is_inf);
     CPPUNIT_TEST(test_half_is_finite);
@@ -56,12 +56,12 @@ public:
 
     void tearDown() {}
 
-    void test_half_makebits()
+    void test_half_from_float()
     {
-        for (unsigned int i = 0; i < 0x10000; i++)
+        for (size_t i = 0; i < sizeof(pairs) / sizeof(*pairs); i++)
         {
-            half a = half::makebits(i);
-            uint16_t b = i;
+            half a = (half)pairs[i].f;
+            uint16_t b = pairs[i].x;
             CPPUNIT_ASSERT_EQUAL(a.bits, b);
         }
     }
@@ -76,12 +76,12 @@ public:
         }
     }
 
-    void test_half_makefast()
+    void test_half_makebits()
     {
-        for (size_t i = 0; i < sizeof(pairs) / sizeof(*pairs); i++)
+        for (unsigned int i = 0; i < 0x10000; i++)
         {
-            half a = half::makefast(pairs[i].f);
-            uint16_t b = pairs[i].x;
+            half a = half::makebits(i);
+            uint16_t b = i;
             CPPUNIT_ASSERT_EQUAL(a.bits, b);
         }
     }
diff --git a/test/lol-bench.cpp b/test/lol-bench.cpp
index d64cc77b..5c691207 100644
--- a/test/lol-bench.cpp
+++ b/test/lol-bench.cpp
@@ -12,48 +12,118 @@
 #   include "config.h"
 #endif
 
+#include <cstdio>
+
 #include "core.h"
 #include "loldebug.h"
 
 using namespace std;
 using namespace lol;
 
+static size_t const HALF_TABLE_SIZE = 1024 * 1024;
+static size_t const HALF_RUNS = 100;
+
+static void bench_half(int mode);
+
 int main(int argc, char **argv)
 {
+    Log::Info("-----------------------------------\n");
+    Log::Info("Half precision floats (random bits)\n");
+    Log::Info("-----------------------------------\n");
+    bench_half(1);
+
+    Log::Info("---------------------------------\n");
+    Log::Info("Half precision floats [-2.0, 2.0]\n");
+    Log::Info("---------------------------------\n");
+    bench_half(2);
+
+    return EXIT_SUCCESS;
+}
+
+static void bench_half(int mode)
+{
+    float result[8] = { 0.0f };
     Timer timer;
 
-    float ftotal = 0.0f;
-    for (uint32_t i = 0; i < 0xffffffffu; i += 7)
-    {
-        union { float f; uint32_t x; } u;
-        u.x = i;
+    /* Set up tables */
+    float *pf = new float[HALF_TABLE_SIZE];
+    half *ph = new half[HALF_TABLE_SIZE];
 
-        float h = (float)half::makefast(u.f);
-        ftotal += h;
+    switch (mode)
+    {
+    case 1:
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] = half::makebits(rand());
+        break;
+    case 2:
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] = RandF(-2.0f, 2.0f);
+        break;
     }
-    Log::Info("time for makeslow: %f (hash %f)\n", timer.GetMs(), ftotal);
 
-    uint16_t total = 0;
-    for (uint32_t i = 0; i < 0xffffffffu; i += 7)
+    for (size_t run = 0; run < HALF_RUNS; run++)
     {
-        union { float f; uint32_t x; } u;
-        u.x = i;
+        /* Convert half to float (array) */
+        timer.GetMs();
+        half::convert(pf, ph, HALF_TABLE_SIZE);
+        result[1] += timer.GetMs();
 
-        half h = half::makeslow(u.f);
-        total ^= h.bits;
-    }
-    Log::Info("time for makeslow: %f (hash %04x)\n", timer.GetMs(), total);
+        /* Convert half to float (one element at a time) */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            pf[i] = (float)ph[i];
+        result[0] += timer.GetMs();
 
-    for (uint32_t i = 0; i < 0xffffffffu; i += 7)
-    {
-        union { float f; uint32_t x; } u;
-        u.x = i;
+        /* Convert float to half (array) */
+        timer.GetMs();
+        half::convert(ph, pf, HALF_TABLE_SIZE);
+        result[4] += timer.GetMs();
+
+        /* Convert float to half (fast) */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] = (half)pf[i];
+        result[2] += timer.GetMs();
+
+        /* Convert float to half (slow) */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] = half::makeslow(pf[i]);
+        result[3] += timer.GetMs();
 
-        half h = half::makefast(u.f);
-        total ^= h.bits;
+        /* Change sign of every half */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] = -ph[i];
+        result[5] += timer.GetMs();
+
+        /* Add a half to every float */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            pf[i] += ph[i];
+        result[6] += timer.GetMs();
+
+        /* Add a float to every half */
+        timer.GetMs();
+        for (size_t i = 0; i < HALF_TABLE_SIZE; i++)
+            ph[i] += pf[i];
+        result[7] += timer.GetMs();
     }
-    Log::Info("time for makefast: %f (hash %04x)\n", timer.GetMs(), total);
 
-    return EXIT_SUCCESS;
+    delete[]pf;
+    delete[]ph;
+
+    for (size_t i = 0; i < sizeof(result) / sizeof(*result); i++)
+        result[i] *= 1000000.0f / (HALF_TABLE_SIZE * HALF_RUNS);
+
+    Log::Info("                         ns/elem\n");
+    Log::Info("float = half            %7.3f\n", result[0]);
+    Log::Info("float[] = half[]        %7.3f\n", result[1]);
+    Log::Info("half = float            %7.3f\n", result[2]);
+    Log::Info("half = makeslow(float)  %7.3f\n", result[3]);
+    Log::Info("half[] = float[]        %7.3f\n", result[4]);
+    Log::Info("half = -half            %7.3f\n", result[5]);
+    Log::Info("float += half           %7.3f\n", result[6]);
+    Log::Info("half += float           %7.3f\n", result[7]);
 }