diff --git a/pipi/filter/blur.c b/pipi/filter/blur.c
index 3561071..2a61940 100644
--- a/pipi/filter/blur.c
+++ b/pipi/filter/blur.c
@@ -40,153 +40,48 @@ pipi_image_t *pipi_gaussian_blur(pipi_image_t *src, float radius)
 pipi_image_t *pipi_gaussian_blur_ext(pipi_image_t *src, float rx, float ry,
                                      float dx, float dy)
 {
-    pipi_image_t *dst;
-    pipi_pixels_t *srcp, *dstp;
-    float *srcdata, *dstdata;
-    double *kernel, *buffer;
-    double K;
-    int x, y, i, w, h, kr, kw, gray;
+    pipi_image_t *ret;
+    double *kernel;
+    double Kx, Ky, t = 0.0;
+    int i, j, krx, kry, m, n;
 
     if(rx < BLUR_EPSILON) rx = BLUR_EPSILON;
     if(ry < BLUR_EPSILON) ry = BLUR_EPSILON;
 
-    w = src->w;
-    h = src->h;
-
-    gray = (src->last_modified == PIPI_PIXELS_Y_F);
-
-    srcp = gray ? pipi_getpixels(src, PIPI_PIXELS_Y_F)
-                : pipi_getpixels(src, PIPI_PIXELS_RGBA_F);
-    srcdata = (float *)srcp->pixels;
-
-    dst = pipi_new(w, h);
-    dstp = gray ? pipi_getpixels(dst, PIPI_PIXELS_Y_F)
-                : pipi_getpixels(dst, PIPI_PIXELS_RGBA_F);
-    dstdata = (float *)dstp->pixels;
-
-    buffer = malloc(w * h * (gray ? 1 : 4) * sizeof(double));
-
     /* FIXME: the kernel becomes far too big with large values of dx, because
      * we grow both left and right. Fix the growing direction. */
-    kr = (int)(3. * rx + .99999 + ceil(abs(dx)));
-    kw = 2 * kr + 1;
-    K = -1. / (2. * rx * rx);
+    krx = (int)(3. * rx + .99999 + ceil(abs(dx)));
+    m = 2 * krx + 1;
+    Kx = -1. / (2. * rx * rx);
+
+    kry = (int)(3. * ry + .99999 + ceil(abs(dy)));
+    n = 2 * kry + 1;
+    Ky = -1. / (2. * ry * ry);
 
-    kernel = malloc(kw * sizeof(double));
-    for(i = -kr; i <= kr; i++)
-        kernel[i + kr] = exp(K * ((double)i + dx) * ((double)i + dx));
+    kernel = malloc(m * n * sizeof(double));
 
-    for(y = 0; y < h; y++)
+    for(j = -kry; j <= kry; j++)
     {
-        for(x = 0; x < w; x++)
+        double ey = Ky * ((double)j + dy) * ((double)j + dy);
+
+        for(i = -krx; i <= krx; i++)
         {
-            if(gray)
-            {
-                double Y = 0., t = 0.;
-                int x2;
-
-                for(i = -kr; i <= kr; i++)
-                {
-                    double f = kernel[i + kr];
-
-                    x2 = x + i;
-                    if(x2 < 0) x2 = 0;
-                    else if(x2 >= w) x2 = w - 1;
-
-                    Y += f * srcdata[y * w + x2];
-                    t += f;
-                }
-
-                buffer[y * w + x] = Y / t;
-            }
-            else
-            {
-                double R = 0., G = 0., B = 0., t = 0.;
-                int x2, off = 4 * (y * w + x);
-
-                for(i = -kr; i <= kr; i++)
-                {
-                    double f = kernel[i + kr];
-
-                    x2 = x + i;
-                    if(x2 < 0) x2 = 0;
-                    else if(x2 >= w) x2 = w - 1;
-
-                    R += f * srcdata[(y * w + x2) * 4];
-                    G += f * srcdata[(y * w + x2) * 4 + 1];
-                    B += f * srcdata[(y * w + x2) * 4 + 2];
-                    t += f;
-                }
-
-                buffer[off] = R / t;
-                buffer[off + 1] = G / t;
-                buffer[off + 2] = B / t;
-            }
+            double ex = Kx * ((double)i + dx) * ((double)i + dx);
+            double d = exp(ex + ey);
+
+            kernel[(j + kry) * m + i + krx] = d;
+            t += d;
         }
     }
 
-    free(kernel);
-
-    kr = (int)(3. * ry + .99999 + ceil(abs(dy)));
-    kw = 2 * kr + 1;
-    K = -1. / (2. * ry * ry);
+    for(j = 0; j < n; j++)
+        for(i = 0; i < m; i++)
+            kernel[j * m + i] /= t;
 
-    kernel = malloc(kw * sizeof(double));
-    for(i = -kr; i <= kr; i++)
-        kernel[i + kr] = exp(K * ((double)i + dy) * ((double)i + dy));
-
-    for(y = 0; y < h; y++)
-    {
-        for(x = 0; x < w; x++)
-        {
-            if(gray)
-            {
-                double Y = 0., t = 0.;
-                int y2;
-
-                for(i = -kr; i <= kr; i++)
-                {
-                    double f = kernel[i + kr];
-
-                    y2 = y + i;
-                    if(y2 < 0) y2 = 0;
-                    else if(y2 >= h) y2 = h - 1;
-
-                    Y += f * buffer[y2 * w + x];
-                    t += f;
-                }
-
-                dstdata[y * w + x] = Y / t;
-            }
-            else
-            {
-                double R = 0., G = 0., B = 0., t = 0.;
-                int y2, off = 4 * (y * w + x);
-
-                for(i = -kr; i <= kr; i++)
-                {
-                    double f = kernel[i + kr];
-
-                    y2 = y + i;
-                    if(y2 < 0) y2 = 0;
-                    else if(y2 >= h) y2 = h - 1;
-
-                    R += f * buffer[(y2 * w + x) * 4];
-                    G += f * buffer[(y2 * w + x) * 4 + 1];
-                    B += f * buffer[(y2 * w + x) * 4 + 2];
-                    t += f;
-                }
-
-                dstdata[off] = R / t;
-                dstdata[off + 1] = G / t;
-                dstdata[off + 2] = B / t;
-            }
-        }
-    }
+    ret = pipi_convolution(src, m, n, kernel);
 
-    free(buffer);
     free(kernel);
 
-    return dst;
+    return ret;
 }
 
diff --git a/pipi/filter/convolution.c b/pipi/filter/convolution.c
index 143f304..2509bfa 100644
--- a/pipi/filter/convolution.c
+++ b/pipi/filter/convolution.c
@@ -74,8 +74,6 @@ pipi_image_t *pipi_convolution(pipi_image_t *src, int m, int n, double mat[])
         }
     }
 
-printf("separabble!\n");
-
     /* Matrix rank is 1! Separate the filter */
     /* FIXME: memleak */
     hvec = malloc(m * sizeof(double));