-rw-r--r--   bench/BlurRectBench.cpp              2
-rw-r--r--   gm/blurrect.cpp                      2
-rw-r--r--   src/effects/SkBlurMask.cpp         588
-rw-r--r--   src/effects/SkBlurMask.h             9
-rw-r--r--   src/effects/SkBlurMaskFilter.cpp     5
5 files changed, 32 insertions, 574 deletions
diff --git a/bench/BlurRectBench.cpp b/bench/BlurRectBench.cpp
index fec0257e2e..3d0a896283 100644
--- a/bench/BlurRectBench.cpp
+++ b/bench/BlurRectBench.cpp
@@ -140,7 +140,7 @@ protected:
virtual void makeBlurryRect(const SkRect&) SK_OVERRIDE {
SkMask mask;
mask.fImage = NULL;
- SkBlurMask::BlurSeparable(&mask, fSrcMask, this->radius(),
+ SkBlurMask::Blur(&mask, fSrcMask, this->radius(),
SkBlurMask::kNormal_Style,
SkBlurMask::kHigh_Quality);
SkMask::FreeImage(mask.fImage);
diff --git a/gm/blurrect.cpp b/gm/blurrect.cpp
index 056d89ca61..5a18d16fad 100644
--- a/gm/blurrect.cpp
+++ b/gm/blurrect.cpp
@@ -243,7 +243,7 @@ protected:
memset(src.fImage, 0xff, src.computeTotalImageSize());
- return SkBlurMask::BlurSeparable(m, src, this->radius(), this->style(), this->getQuality());
+ return SkBlurMask::Blur(m, src, this->radius(), this->style(), this->getQuality());
}
virtual SkBlurMask::Quality getQuality() {
diff --git a/src/effects/SkBlurMask.cpp b/src/effects/SkBlurMask.cpp
index 8a92eef027..0089bab96e 100644
--- a/src/effects/SkBlurMask.cpp
+++ b/src/effects/SkBlurMask.cpp
@@ -412,482 +412,6 @@ static void get_adjusted_radii(SkScalar passRadius, int *loRadius, int *hiRadius
}
}
-// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
-// breakeven on Mac, and ~15% slowdown on Linux.
-// Reading a word at a time when building the sum buffer seems to give
-// us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux.
-#if defined(SK_BUILD_FOR_WIN32)
-#define UNROLL_KERNEL_LOOP 1
-#endif
-
-/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
- src values at their position, plus all values above and to the left.
- When we sample into this buffer, we need an initial row and column of 0s,
- so we have an index correspondence as follows:
-
- src[i, j] == sum[i+1, j+1]
- sum[0, j] == sum[i, 0] == 0
-
- We assume that the sum buffer's stride == its width
- */
-static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
- const uint8_t src[], int srcRB) {
- int sumW = srcW + 1;
-
- SkASSERT(srcRB >= srcW);
- // mod srcRB so we can apply it after each row
- srcRB -= srcW;
-
- int x, y;
-
- // zero out the top row and column
- memset(sum, 0, sumW * sizeof(sum[0]));
- sum += sumW;
-
- // special case first row
- uint32_t X = 0;
- *sum++ = 0; // initialize the first column to 0
- for (x = srcW - 1; x >= 0; --x) {
- X = *src++ + X;
- *sum++ = X;
- }
- src += srcRB;
-
- // now do the rest of the rows
- for (y = srcH - 1; y > 0; --y) {
- uint32_t L = 0;
- uint32_t C = 0;
- *sum++ = 0; // initialize the first column to 0
-
- for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
- uint32_t T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- }
-
- for (; x >= 4; x-=4) {
- uint32_t T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- }
-
- for (; x >= 0; --x) {
- uint32_t T = sum[-sumW];
- X = *src++ + L + T - C;
- *sum++ = X;
- L = X;
- C = T;
- }
- src += srcRB;
- }
-}
-
-/**
- * This is the path for apply_kernel() to be taken when the kernel
- * is wider than the source image.
- */
-static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
- int sw, int sh) {
- SkASSERT(2*rx > sw);
-
- uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));
-
- int sumStride = sw + 1;
-
- int dw = sw + 2*rx;
- int dh = sh + 2*ry;
-
- int prev_y = -2*ry;
- int next_y = 1;
-
- for (int y = 0; y < dh; ++y) {
- int py = SkClampPos(prev_y) * sumStride;
- int ny = SkFastMin32(next_y, sh) * sumStride;
-
- int prev_x = -2*rx;
- int next_x = 1;
-
- for (int x = 0; x < dw; ++x) {
- int px = SkClampPos(prev_x);
- int nx = SkFastMin32(next_x, sw);
-
- // TODO: should we be adding 1/2 (1 << 23) to round to the
- // nearest integer here?
- uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- prev_y += 1;
- next_y += 1;
- }
-}
-/**
- * sw and sh are the width and height of the src. Since the sum buffer
- * matches that, but has an extra row and col at the beginning (with zeros),
- * we can just use sw and sh as our "max" values for pinning coordinates
- * when sampling into sum[][]
- *
- * The inner loop is conceptually simple; we break it into several sections
- * to improve performance. Here's the original version:
- for (int x = 0; x < dw; ++x) {
- int px = SkClampPos(prev_x);
- int nx = SkFastMin32(next_x, sw);
-
- uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
- * The sections are:
- * left-hand section, where prev_x is clamped to 0
- * center section, where neither prev_x nor next_x is clamped
- * right-hand section, where next_x is clamped to sw
- * On some operating systems, the center section is unrolled for additional
- * speedup.
-*/
-static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
- int sw, int sh) {
- if (2*rx > sw) {
- kernel_clamped(dst, rx, ry, sum, sw, sh);
- return;
- }
-
- uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));
-
- int sumStride = sw + 1;
-
- int dw = sw + 2*rx;
- int dh = sh + 2*ry;
-
- int prev_y = -2*ry;
- int next_y = 1;
-
- SkASSERT(2*rx <= dw - 2*rx);
-
- for (int y = 0; y < dh; ++y) {
- int py = SkClampPos(prev_y) * sumStride;
- int ny = SkFastMin32(next_y, sh) * sumStride;
-
- int prev_x = -2*rx;
- int next_x = 1;
- int x = 0;
-
- for (; x < 2*rx; ++x) {
- SkASSERT(prev_x <= 0);
- SkASSERT(next_x <= sw);
-
- int px = 0;
- int nx = next_x;
-
- uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- int i0 = prev_x + py;
- int i1 = next_x + ny;
- int i2 = next_x + py;
- int i3 = prev_x + ny;
-
-#if UNROLL_KERNEL_LOOP
- for (; x < dw - 2*rx - 4; x += 4) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x <= sw);
-
- uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- *dst++ = SkToU8(tmp * scale >> 24);
- tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- *dst++ = SkToU8(tmp * scale >> 24);
- tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- *dst++ = SkToU8(tmp * scale >> 24);
- tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 4;
- next_x += 4;
- }
-#endif
-
- for (; x < dw - 2*rx; ++x) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x <= sw);
-
- uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- for (; x < dw; ++x) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x > sw);
-
- int px = prev_x;
- int nx = sw;
-
- uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
- *dst++ = SkToU8(tmp * scale >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- prev_y += 1;
- next_y += 1;
- }
-}
-
-/**
- * This is the path for apply_kernel_interp() to be taken when the kernel
- * is wider than the source image.
- */
-static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
- const uint32_t sum[], int sw, int sh, U8CPU outerWeight) {
- SkASSERT(2*rx > sw);
-
- int innerWeight = 255 - outerWeight;
-
- // round these guys up if they're bigger than 127
- outerWeight += outerWeight >> 7;
- innerWeight += innerWeight >> 7;
-
- uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1));
- uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1));
-
- int sumStride = sw + 1;
-
- int dw = sw + 2*rx;
- int dh = sh + 2*ry;
-
- int prev_y = -2*ry;
- int next_y = 1;
-
- for (int y = 0; y < dh; ++y) {
- int py = SkClampPos(prev_y) * sumStride;
- int ny = SkFastMin32(next_y, sh) * sumStride;
-
- int ipy = SkClampPos(prev_y + 1) * sumStride;
- int iny = SkClampMax(next_y - 1, sh) * sumStride;
-
- int prev_x = -2*rx;
- int next_x = 1;
-
- for (int x = 0; x < dw; ++x) {
- int px = SkClampPos(prev_x);
- int nx = SkFastMin32(next_x, sw);
-
- int ipx = SkClampPos(prev_x + 1);
- int inx = SkClampMax(next_x - 1, sw);
-
- uint32_t outerSum = sum[px+py] + sum[nx+ny]
- - sum[nx+py] - sum[px+ny];
- uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
- - sum[inx+ipy] - sum[ipx+iny];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
- prev_y += 1;
- next_y += 1;
- }
-}
-
-/**
- * sw and sh are the width and height of the src. Since the sum buffer
- * matches that, but has an extra row and col at the beginning (with zeros),
- * we can just use sw and sh as our "max" values for pinning coordinates
- * when sampling into sum[][]
- *
- * The inner loop is conceptually simple; we break it into several variants
- * to improve performance. Here's the original version:
- for (int x = 0; x < dw; ++x) {
- int px = SkClampPos(prev_x);
- int nx = SkFastMin32(next_x, sw);
-
- int ipx = SkClampPos(prev_x + 1);
- int inx = SkClampMax(next_x - 1, sw);
-
- uint32_t outerSum = sum[px+py] + sum[nx+ny]
- - sum[nx+py] - sum[px+ny];
- uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
- - sum[inx+ipy] - sum[ipx+iny];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
- * The sections are:
- * left-hand section, where prev_x is clamped to 0
- * center section, where neither prev_x nor next_x is clamped
- * right-hand section, where next_x is clamped to sw
- * On some operating systems, the center section is unrolled for additional
- * speedup.
-*/
-static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
- const uint32_t sum[], int sw, int sh, U8CPU outerWeight) {
- SkASSERT(rx > 0 && ry > 0);
- SkASSERT(outerWeight <= 255);
-
- if (2*rx > sw) {
- kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outerWeight);
- return;
- }
-
- int innerWeight = 255 - outerWeight;
-
- // round these guys up if they're bigger than 127
- outerWeight += outerWeight >> 7;
- innerWeight += innerWeight >> 7;
-
- uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1));
- uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1));
-
- int sumStride = sw + 1;
-
- int dw = sw + 2*rx;
- int dh = sh + 2*ry;
-
- int prev_y = -2*ry;
- int next_y = 1;
-
- SkASSERT(2*rx <= dw - 2*rx);
-
- for (int y = 0; y < dh; ++y) {
- int py = SkClampPos(prev_y) * sumStride;
- int ny = SkFastMin32(next_y, sh) * sumStride;
-
- int ipy = SkClampPos(prev_y + 1) * sumStride;
- int iny = SkClampMax(next_y - 1, sh) * sumStride;
-
- int prev_x = -2*rx;
- int next_x = 1;
- int x = 0;
-
- for (; x < 2*rx; ++x) {
- SkASSERT(prev_x < 0);
- SkASSERT(next_x <= sw);
-
- int px = 0;
- int nx = next_x;
-
- int ipx = 0;
- int inx = next_x - 1;
-
- uint32_t outerSum = sum[px+py] + sum[nx+ny]
- - sum[nx+py] - sum[px+ny];
- uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
- - sum[inx+ipy] - sum[ipx+iny];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- int i0 = prev_x + py;
- int i1 = next_x + ny;
- int i2 = next_x + py;
- int i3 = prev_x + ny;
- int i4 = prev_x + 1 + ipy;
- int i5 = next_x - 1 + iny;
- int i6 = next_x - 1 + ipy;
- int i7 = prev_x + 1 + iny;
-
-#if UNROLL_KERNEL_LOOP
- for (; x < dw - 2*rx - 4; x += 4) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x <= sw);
-
- uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
- outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
- outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
- outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 4;
- next_x += 4;
- }
-#endif
-
- for (; x < dw - 2*rx; ++x) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x <= sw);
-
- uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
- uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- for (; x < dw; ++x) {
- SkASSERT(prev_x >= 0);
- SkASSERT(next_x > sw);
-
- int px = prev_x;
- int nx = sw;
-
- int ipx = prev_x + 1;
- int inx = sw;
-
- uint32_t outerSum = sum[px+py] + sum[nx+ny]
- - sum[nx+py] - sum[px+ny];
- uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
- - sum[inx+ipy] - sum[ipx+iny];
- *dst++ = SkToU8((outerSum * outerScale
- + innerSum * innerScale) >> 24);
-
- prev_x += 1;
- next_x += 1;
- }
-
- prev_y += 1;
- next_y += 1;
- }
-}
-
#include "SkColorPriv.h"
static void merge_src_with_blur(uint8_t dst[], int dstRB,
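[Editorial aside.] The block removed above was the non-separable path: build_sum_buffer() accumulates a summed-area table (with an extra zero row and column, so src[i, j] corresponds to sum[i+1, j+1]), and apply_kernel() / apply_kernel_interp() read four corners of that table to obtain each box sum, clamping the corner coordinates at the image edges. The following is a minimal standalone sketch of that technique, not Skia code: the helper names are hypothetical, it samples at source positions instead of over the enlarged destination, and it omits the interpolated (two-box) variant and the fixed-point scaling.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Build a (w+1) x (h+1) summed-area table. sat[y+1][x+1] is the sum of src
    // over the rectangle [0..x] x [0..y]; row 0 and column 0 stay zero, matching
    // the index correspondence described in the removed comment.
    static std::vector<uint32_t> buildSumBuffer(const uint8_t* src, int w, int h,
                                                int rowBytes) {
        std::vector<uint32_t> sat((w + 1) * (h + 1), 0);
        const int stride = w + 1;
        for (int y = 0; y < h; ++y) {
            for (int x = 0; x < w; ++x) {
                sat[(y + 1) * stride + (x + 1)] = src[y * rowBytes + x]
                                                + sat[y * stride + (x + 1)]   // sum above
                                                + sat[(y + 1) * stride + x]   // sum to the left
                                                - sat[y * stride + x];        // overlap counted twice
            }
        }
        return sat;
    }

    // Box sum over a (2*rx+1) x (2*ry+1) window centered at (cx, cy): four table
    // lookups, with the corners clamped to the source bounds. Dividing by the
    // full window area (as the removed apply_kernel() did) is what fades the
    // result out toward the edges.
    static uint8_t boxAverage(const std::vector<uint32_t>& sat, int w, int h,
                              int cx, int cy, int rx, int ry) {
        const int stride = w + 1;
        const int x0 = std::max(cx - rx, 0),     y0 = std::max(cy - ry, 0);
        const int x1 = std::min(cx + rx + 1, w), y1 = std::min(cy + ry + 1, h);
        const uint32_t sum = sat[y1 * stride + x1] - sat[y0 * stride + x1]
                           - sat[y1 * stride + x0] + sat[y0 * stride + x0];
        return static_cast<uint8_t>(sum / ((2 * rx + 1) * (2 * ry + 1)));
    }

Each output pixel costs four lookups regardless of radius, but as the later hunks show, the high-quality mode had to rebuild the table for each of its three passes over a progressively larger image; that repeated rebuild is the cost the separable path avoids.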
@@ -955,7 +479,7 @@ void SkMask_FreeImage(uint8_t* image) {
bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
SkScalar radius, Style style, Quality quality,
- SkIPoint* margin, bool separable)
+ SkIPoint* margin)
{
if (src.fFormat != SkMask::kA8_Format) {
@@ -1011,77 +535,39 @@ bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dp);
// build the blurry destination
- if (separable) {
- SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
- uint8_t* tp = tmpBuffer.get();
- int w = sw, h = sh;
-
- if (outerWeight == 255) {
- int loRadius, hiRadius;
- get_adjusted_radii(passRadius, &loRadius, &hiRadius);
- if (kHigh_Quality == quality) {
- // Do three X blurs, with a transpose on the final one.
- w = boxBlur(sp, src.fRowBytes, tp, loRadius, hiRadius, w, h, false);
- w = boxBlur(tp, w, dp, hiRadius, loRadius, w, h, false);
- w = boxBlur(dp, w, tp, hiRadius, hiRadius, w, h, true);
- // Do three Y blurs, with a transpose on the final one.
- h = boxBlur(tp, h, dp, loRadius, hiRadius, h, w, false);
- h = boxBlur(dp, h, tp, hiRadius, loRadius, h, w, false);
- h = boxBlur(tp, h, dp, hiRadius, hiRadius, h, w, true);
- } else {
- w = boxBlur(sp, src.fRowBytes, tp, rx, rx, w, h, true);
- h = boxBlur(tp, h, dp, ry, ry, h, w, true);
- }
+ SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
+ uint8_t* tp = tmpBuffer.get();
+ int w = sw, h = sh;
+
+ if (outerWeight == 255) {
+ int loRadius, hiRadius;
+ get_adjusted_radii(passRadius, &loRadius, &hiRadius);
+ if (kHigh_Quality == quality) {
+ // Do three X blurs, with a transpose on the final one.
+ w = boxBlur(sp, src.fRowBytes, tp, loRadius, hiRadius, w, h, false);
+ w = boxBlur(tp, w, dp, hiRadius, loRadius, w, h, false);
+ w = boxBlur(dp, w, tp, hiRadius, hiRadius, w, h, true);
+ // Do three Y blurs, with a transpose on the final one.
+ h = boxBlur(tp, h, dp, loRadius, hiRadius, h, w, false);
+ h = boxBlur(dp, h, tp, hiRadius, loRadius, h, w, false);
+ h = boxBlur(tp, h, dp, hiRadius, hiRadius, h, w, true);
} else {
- if (kHigh_Quality == quality) {
- // Do three X blurs, with a transpose on the final one.
- w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, false, outerWeight);
- w = boxBlurInterp(tp, w, dp, rx, w, h, false, outerWeight);
- w = boxBlurInterp(dp, w, tp, rx, w, h, true, outerWeight);
- // Do three Y blurs, with a transpose on the final one.
- h = boxBlurInterp(tp, h, dp, ry, h, w, false, outerWeight);
- h = boxBlurInterp(dp, h, tp, ry, h, w, false, outerWeight);
- h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
- } else {
- w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, true, outerWeight);
- h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
- }
+ w = boxBlur(sp, src.fRowBytes, tp, rx, rx, w, h, true);
+ h = boxBlur(tp, h, dp, ry, ry, h, w, true);
}
} else {
- const size_t storageW = sw + 2 * (passCount - 1) * rx + 1;
- const size_t storageH = sh + 2 * (passCount - 1) * ry + 1;
- SkAutoTMalloc<uint32_t> storage(storageW * storageH);
- uint32_t* sumBuffer = storage.get();
-
- //pass1: sp is source, dp is destination
- build_sum_buffer(sumBuffer, sw, sh, sp, src.fRowBytes);
- if (outerWeight == 255) {
- apply_kernel(dp, rx, ry, sumBuffer, sw, sh);
- } else {
- apply_kernel_interp(dp, rx, ry, sumBuffer, sw, sh, outerWeight);
- }
-
if (kHigh_Quality == quality) {
- //pass2: dp is source, tmpBuffer is destination
- int tmp_sw = sw + 2 * rx;
- int tmp_sh = sh + 2 * ry;
- SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
- build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, dp, tmp_sw);
- if (outerWeight == 255)
- apply_kernel(tmpBuffer.get(), rx, ry, sumBuffer, tmp_sw, tmp_sh);
- else
- apply_kernel_interp(tmpBuffer.get(), rx, ry, sumBuffer,
- tmp_sw, tmp_sh, outerWeight);
-
- //pass3: tmpBuffer is source, dp is destination
- tmp_sw += 2 * rx;
- tmp_sh += 2 * ry;
- build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, tmpBuffer.get(), tmp_sw);
- if (outerWeight == 255)
- apply_kernel(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh);
- else
- apply_kernel_interp(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh,
- outerWeight);
+ // Do three X blurs, with a transpose on the final one.
+ w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, false, outerWeight);
+ w = boxBlurInterp(tp, w, dp, rx, w, h, false, outerWeight);
+ w = boxBlurInterp(dp, w, tp, rx, w, h, true, outerWeight);
+ // Do three Y blurs, with a transpose on the final one.
+ h = boxBlurInterp(tp, h, dp, ry, h, w, false, outerWeight);
+ h = boxBlurInterp(dp, h, tp, ry, h, w, false, outerWeight);
+ h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
+ } else {
+ w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, true, outerWeight);
+ h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
}
}
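[Editorial aside.] The retained path above is the separable one: the blur is decomposed into box-filter passes along X and then Y, and in kHigh_Quality mode three passes per axis are run so the overall profile approaches a Gaussian. Writing each final pass transposed lets the same row-oriented routine serve both axes. Below is a minimal sketch of one such pass; it is not Skia's boxBlur() (which zero-pads and grows the mask by the radius). The hypothetical helper clamps at the edges and keeps the output the same size, purely to show the running-sum-plus-transpose idea.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // One horizontal box-blur pass of radius r over a w x h A8 image, written
    // transposed (h x w) so that running the same routine again blurs what were
    // originally the columns.
    static void boxBlurRowsTransposed(const uint8_t* src, int w, int h, int r,
                                      std::vector<uint8_t>* dst) {
        dst->assign(w * h, 0);
        const int window = 2 * r + 1;
        for (int y = 0; y < h; ++y) {
            const uint8_t* row = src + y * w;
            uint32_t sum = 0;
            for (int i = -r; i <= r; ++i) {                       // prime the window at x == 0
                sum += row[std::min(std::max(i, 0), w - 1)];
            }
            for (int x = 0; x < w; ++x) {
                (*dst)[x * h + y] = static_cast<uint8_t>(sum / window);  // transposed store
                sum += row[std::min(x + r + 1, w - 1)];           // sample entering the window
                sum -= row[std::max(x - r, 0)];                   // sample leaving the window
            }
        }
    }

    // Two alternating calls give one X and one Y pass and restore the original
    // orientation; the kHigh_Quality branch above instead runs three passes per
    // axis, transposing only on the last pass of each triple.
    //   std::vector<uint8_t> a, b;
    //   boxBlurRowsTransposed(src, w, h, r, &a);        // blur rows; result is h x w
    //   boxBlurRowsTransposed(a.data(), h, w, r, &b);   // blur original columns; back to w x h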
@@ -1115,20 +601,6 @@ bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
return true;
}
-bool SkBlurMask::BlurSeparable(SkMask* dst, const SkMask& src,
- SkScalar radius, Style style, Quality quality,
- SkIPoint* margin)
-{
- return SkBlurMask::Blur(dst, src, radius, style, quality, margin, true);
-}
-
-bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
- SkScalar radius, Style style, Quality quality,
- SkIPoint* margin)
-{
- return SkBlurMask::Blur(dst, src, radius, style, quality, margin, false);
-}
-
/* Convolving a box with itself three times results in a piecewise
quadratic function:
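[Editorial aside.] The truncated comment ending this hunk refers to the shape produced by repeated box filtering. As a worked form, assuming a unit-width box B(x) = 1 for |x| <= 1/2 and 0 otherwise, two convolutions (three box factors) give the piecewise quadratic

    (B \ast B \ast B)(x) =
        \begin{cases}
            \tfrac{3}{4} - x^2,                       & |x| \le \tfrac{1}{2} \\
            \tfrac{1}{2}\,(\tfrac{3}{2} - |x|)^2,     & \tfrac{1}{2} < |x| \le \tfrac{3}{2} \\
            0,                                        & \text{otherwise}
        \end{cases}

i.e. the quadratic B-spline, which is the Gaussian-like profile the three box passes of the high-quality path produce (rescaled by the actual box width 2*r + 1).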
diff --git a/src/effects/SkBlurMask.h b/src/effects/SkBlurMask.h
index b60c2aadcc..9c625adad8 100644
--- a/src/effects/SkBlurMask.h
+++ b/src/effects/SkBlurMask.h
@@ -36,10 +36,6 @@ public:
static bool Blur(SkMask* dst, const SkMask& src,
SkScalar radius, Style style, Quality quality,
SkIPoint* margin = NULL);
- static bool BlurSeparable(SkMask* dst, const SkMask& src,
- SkScalar radius, Style style, Quality quality,
- SkIPoint* margin = NULL);
-
// the "ground truth" blur does a gaussian convolution; it's slow
// but useful for comparison purposes.
@@ -47,11 +43,6 @@ public:
static bool BlurGroundTruth(SkMask* dst, const SkMask& src,
SkScalar provided_radius, Style style,
SkIPoint* margin = NULL);
-
-private:
- static bool Blur(SkMask* dst, const SkMask& src,
- SkScalar radius, Style style, Quality quality,
- SkIPoint* margin, bool separable);
};
#endif
diff --git a/src/effects/SkBlurMaskFilter.cpp b/src/effects/SkBlurMaskFilter.cpp
index 3d0fb81e48..eaf7704bce 100644
--- a/src/effects/SkBlurMaskFilter.cpp
+++ b/src/effects/SkBlurMaskFilter.cpp
@@ -106,13 +106,8 @@ bool SkBlurMaskFilterImpl::filterMask(SkMask* dst, const SkMask& src,
(fBlurFlags & SkBlurMaskFilter::kHighQuality_BlurFlag) ?
SkBlurMask::kHigh_Quality : SkBlurMask::kLow_Quality;
-#ifndef SK_DISABLE_SEPARABLE_MASK_BLUR
- return SkBlurMask::BlurSeparable(dst, src, radius, (SkBlurMask::Style)fBlurStyle,
- blurQuality, margin);
-#else
return SkBlurMask::Blur(dst, src, radius, (SkBlurMask::Style)fBlurStyle,
blurQuality, margin);
-#endif
}
bool SkBlurMaskFilterImpl::filterRectMask(SkMask* dst, const SkRect& r,
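[Editorial aside.] With BlurSeparable() folded into Blur(), every caller goes through the single entry point shown in the SkBlurMask.h change above. A minimal, hypothetical call site after this patch (the radius, style, and helper name are illustrative, not taken from the patch):

    #include "SkBlurMask.h"
    #include "SkMask.h"

    static bool blurA8Mask(const SkMask& src, SkMask* dst) {
        SkIPoint margin;
        // The separable implementation is now the only one behind Blur(); there
        // is no flag or separate entry point needed to select it.
        bool ok = SkBlurMask::Blur(dst, src, SkIntToScalar(4),
                                   SkBlurMask::kNormal_Style,
                                   SkBlurMask::kHigh_Quality,
                                   &margin);
        if (ok) {
            // ... consume dst->fImage ...
            SkMask::FreeImage(dst->fImage);   // caller owns the blurred image
        }
        return ok;
    }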