diff --git a/src/core.h b/src/core.h
index ebd7de92..4e8866df 100644
--- a/src/core.h
+++ b/src/core.h
@@ -17,6 +17,9 @@
 #define __LOL_CORE_H__
 
 // CPU features
+#undef LOL_FEATURE_CHEAP_BRANCHES
+#undef LOL_FEATURE_VERY_CHEAP_BRANCHES
+
 #if !defined __CELLOS_LV2__
 #   define LOL_FEATURE_CHEAP_BRANCHES
 #endif
diff --git a/src/trig.cpp b/src/trig.cpp
index 8a397dc3..5ef4b76b 100644
--- a/src/trig.cpp
+++ b/src/trig.cpp
@@ -206,7 +206,6 @@ double lol_sin(double x)
     double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
     sign = lol_fsel(is_even, sign, -sign);
 #else
-    double sign = (x >= 0.0) ? PI : NEG_PI;
     double num_cycles = absx + TWO_EXP_52;
     __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
 
@@ -215,17 +214,19 @@ double lol_sin(double x)
     __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
     __asm__("" : "+m" (is_even));
     is_even -= TWO * num_cycles - ONE;
-    sign *= is_even;
+    double sign = is_even;
 #endif
     absx -= num_cycles;
 
+    /* If branches are very cheap, we have the option to do the Taylor
+     * series at a much lower degree by splitting. */
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
     if (lol_fabs(absx) > QUARTER)
     {
-        sign = (x * absx >= 0.0) ? is_even : -is_even;
+        sign = (x * absx >= 0.0) ? sign : -sign;
 
-        double k = HALF - lol_fabs(absx);
-        double x2 = k * k;
+        double x1 = HALF - lol_fabs(absx);
+        double x2 = x1 * x1;
         double x4 = x2 * x2;
         double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
         double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
@@ -235,6 +236,8 @@ double lol_sin(double x)
     }
 #endif
 
+    sign *= (x >= 0.0) ? PI : NEG_PI;
+
     double x2 = absx * absx;
     double x4 = x2 * x2;
 #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
@@ -249,5 +252,66 @@ double lol_sin(double x)
     return absx * taylor * sign;
 }
 
+double lol_cos(double x)
+{
+    double absx = lol_fabs(x * INV_PI);
+
+#if defined LOL_FEATURE_CHEAP_BRANCHES
+    if (absx < QUARTER)
+    {
+        double x2 = absx * absx;
+        double x4 = x2 * x2;
+        double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
+        double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+        double taylor = (sub1 * x2 + sub2) * x2 + ONE;
+        return taylor;
+    }
+#endif
+
+#if defined __CELLOS_LV2__
+    double num_cycles = lol_round(absx);
+    double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
+    double sign = lol_fsel(is_even, ONE, NEG_ONE);
+#else
+    double num_cycles = absx + TWO_EXP_52;
+    __asm__("" : "+m" (num_cycles)); num_cycles -= TWO_EXP_52;
+
+    double is_even = TWO * num_cycles - ONE;
+    __asm__("" : "+m" (is_even)); is_even += TWO_EXP_54;
+    __asm__("" : "+m" (is_even)); is_even -= TWO_EXP_54;
+    __asm__("" : "+m" (is_even));
+    is_even -= TWO * num_cycles - ONE;
+    double sign = is_even;
+#endif
+    absx -= num_cycles;
+
+#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
+    if (lol_fabs(absx) > QUARTER)
+    {
+        double x1 = HALF - lol_fabs(absx);
+        double x2 = x1 * x1;
+        double x4 = x2 * x2;
+        double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
+        double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
+        double taylor = sub2 * x2 + sub1;
+
+        return x1 * taylor * sign * PI;
+    }
+#endif
+
+    double x2 = absx * absx;
+    double x4 = x2 * x2;
+#if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
+    double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
+    double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
+#else
+    double sub1 = ((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1];
+    double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
+#endif
+    double taylor = (sub1 * x2 + sub2) * x2 + ONE;
+
+    return taylor * sign;
+}
+
 } /* namespace lol */
 
diff --git a/test/lol-bench.cpp b/test/lol-bench.cpp
index eb8c813c..3451195e 100644
--- a/test/lol-bench.cpp
+++ b/test/lol-bench.cpp
@@ -74,7 +74,7 @@ int main(int argc, char **argv)
 
 static void bench_trig(int mode)
 {
-    float result[5] = { 0.0f };
+    float result[7] = { 0.0f };
     Timer timer;
 
     /* Set up tables */
@@ -127,11 +127,27 @@ static void bench_trig(int mode)
             pf2[i] = __builtin_cosf(pf[i]);
         result[3] += timer.GetMs();
 
+        /* Fast cos */
+        timer.GetMs();
+        for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
+#if defined HAVE_FASTMATH_H
+            pf2[i] = f_cosf(pf[i]);
+#else
+            pf2[i] = cosf(pf[i]);
+#endif
+        result[4] += timer.GetMs();
+
+        /* Lol cos */
+        timer.GetMs();
+        for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
+            pf2[i] = lol_cos(pf[i]);
+        result[5] += timer.GetMs();
+
         /* Tan */
         timer.GetMs();
         for (size_t i = 0; i < TRIG_TABLE_SIZE; i++)
             pf2[i] = __builtin_tanf(pf[i]);
-        result[4] += timer.GetMs();
+        result[6] += timer.GetMs();
     }
 
     delete[] pf;
@@ -145,7 +161,9 @@ static void bench_trig(int mode)
     Log::Info("float = fastsinf(float)  %7.3f\n", result[1]);
     Log::Info("float = lol_sinf(float)  %7.3f\n", result[2]);
     Log::Info("float = cosf(float)      %7.3f\n", result[3]);
-    Log::Info("float = tanf(float)      %7.3f\n", result[4]);
+    Log::Info("float = fastcosf(float)  %7.3f\n", result[4]);
+    Log::Info("float = lol_cosf(float)  %7.3f\n", result[5]);
+    Log::Info("float = tanf(float)      %7.3f\n", result[6]);
 }
 
 static void bench_matrix(int mode)
diff --git a/test/trig.cpp b/test/trig.cpp
index b1b56376..4a91ab9a 100644
--- a/test/trig.cpp
+++ b/test/trig.cpp
@@ -54,6 +54,22 @@ public:
             double b = lol_sin(f);
             CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
         }
+
+        for (int i = -10000; i < 10000; i++)
+        {
+            double f = (double)i * (1.0 / 1000.0);
+            double a = __builtin_cos(f);
+            double b = lol_cos(f);
+            CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
+        }
+
+        for (int i = -10000; i < 10000; i++)
+        {
+            double f = (double)i * (1.0 / 100000.0);
+            double a = __builtin_cos(f);
+            double b = lol_cos(f);
+            CPPUNIT_ASSERT(fabs(a - b) <= fabs(f) * 1e-11);
+        }
     }
 };