15 files changed, 1692 insertions, 268 deletions
diff --git a/gm/downsamplebitmap.cpp b/gm/downsamplebitmap.cpp
index a59e5b85df..e34effa07f 100644
--- a/gm/downsamplebitmap.cpp
+++ b/gm/downsamplebitmap.cpp
@@ -75,7 +75,7 @@ protected:
             curWidth = (int) (fBM.width() * curScale + 2);
             curX += curWidth;
             curScale *= 0.75f;
-        } while (curX < 4*fBM.width());
+        } while (curWidth >= 2 && curX < 4*fBM.width());
     }
 
 private:
diff --git a/gyp/core.gypi b/gyp/core.gypi
index bf5e245924..eac96f6f9f 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -32,6 +32,8 @@
         '<(skia_src_path)/core/SkBitmapProcState_matrix.h',
         '<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp',
         '<(skia_src_path)/core/SkBitmapProcState_sample.h',
+        '<(skia_src_path)/core/SkBitmapScaler.h',
+        '<(skia_src_path)/core/SkBitmapScaler.cpp',        
         '<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h',
         '<(skia_src_path)/core/SkBitmapShaderTemplate.h',
         '<(skia_src_path)/core/SkBitmap_scroll.cpp',
@@ -56,6 +58,8 @@
         '<(skia_src_path)/core/SkComposeShader.cpp',
         '<(skia_src_path)/core/SkConfig8888.cpp',
         '<(skia_src_path)/core/SkConfig8888.h',
+        '<(skia_src_path)/core/SkConvolver.cpp',
+        '<(skia_src_path)/core/SkConvolver.h',
         '<(skia_src_path)/core/SkCordic.cpp',
         '<(skia_src_path)/core/SkCordic.h',
         '<(skia_src_path)/core/SkCoreBlitters.h',
diff --git a/include/core/SkBitmap.h b/include/core/SkBitmap.h
index d5277c6c80..6d368f5b49 100644
--- a/include/core/SkBitmap.h
+++ b/include/core/SkBitmap.h
@@ -702,19 +702,7 @@ private:
     int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy);
     bool hasMipMap() const;
     void freeMipMap();
-
-    /** Make a scaled copy of this bitmap into the provided destination.
-      * The caller is responsible for having set the width and height of the
-      * provided destination bitmap, and also having allocated its pixel
-      * memory.
-      *
-      * This function is temporary and for testing purposes only; it will
-      * likely move once it has been properly plumbed into the bitmap
-      * shader infrastructure.
-      */
-
-    void scale(SkBitmap *dst) const;
-
+    
     friend struct SkBitmapProcState;
 };
 
diff --git a/src/core/SkBitmapFilter.cpp b/src/core/SkBitmapFilter.cpp
index 434ea9a536..060400944f 100644
--- a/src/core/SkBitmapFilter.cpp
+++ b/src/core/SkBitmapFilter.cpp
@@ -5,15 +5,23 @@
  * found in the LICENSE file.
  */
 
+#include "SkErrorInternals.h"
+#include "SkConvolver.h"
 #include "SkBitmapProcState.h"
 #include "SkBitmap.h"
 #include "SkColor.h"
 #include "SkColorPriv.h"
+#include "SkConvolver.h"
 #include "SkUnPreMultiply.h"
 #include "SkShader.h"
 #include "SkRTConf.h"
 #include "SkMath.h"
 
+// These are the per-scanline callbacks that are used when we must resort to
+// resampling an image as it is blitted.  Typically these are used only when
+// the image is rotated or has some other complex transformation applied.
+// Scaled images will usually be rescaled directly before rasterization.
+
 void highQualityFilter(const SkBitmapProcState& s, int x, int y,
                    SkPMColor* SK_RESTRICT colors, int count) {
 
@@ -68,71 +76,15 @@ void highQualityFilter(const SkBitmapProcState& s, int x, int y,
     }
 }
 
-void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
-                             SkPMColor *SK_RESTRICT colors, int count) {
-     const int maxX = s.fBitmap->width() - 1;
-     const int maxY = s.fBitmap->height() - 1;
-
-     SkPoint srcPt;
-
-     s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
-                 SkFloatToScalar(y + 0.5f), &srcPt);
-     srcPt.fY -= SK_ScalarHalf;
-     int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
-     int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()), maxY);
-
-     while (count-- > 0) {
-         s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
-                     SkFloatToScalar(y + 0.5f), &srcPt);
-         srcPt.fX -= SK_ScalarHalf;
-         srcPt.fY -= SK_ScalarHalf;
-
-         SkScalar weight = 0;
-         SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
-
-         int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
-         int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width()), maxX);
-
-         for (int srcY = y0; srcY <= y1; srcY++) {
-             SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
-
-             for (int srcX = x0; srcX <= x1 ; srcX++) {
-                 SkScalar xWeight = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
-
-                 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
-
-                 SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
-                 fr += combined_weight * SkGetPackedR32(c);
-                 fg += combined_weight * SkGetPackedG32(c);
-                 fb += combined_weight * SkGetPackedB32(c);
-                 fa += combined_weight * SkGetPackedA32(c);
-                 weight += combined_weight;
-             }
-         }
-
-         fr = SkScalarDiv(fr, weight);
-         fg = SkScalarDiv(fg, weight);
-         fb = SkScalarDiv(fb, weight);
-         fa = SkScalarDiv(fa, weight);
-
-         int a = SkClampMax(SkScalarRoundToInt(fa), 255);
-         int r = SkClampMax(SkScalarRoundToInt(fr), a);
-         int g = SkClampMax(SkScalarRoundToInt(fg), a);
-         int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
-         *colors++ = SkPackARGB32(a, r, g, b);
-
-         x++;
-     }
-}
-
-SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which bitmap filter to use [mitchell, sinc, gaussian, triangle, box]");
+SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which scanline bitmap filter to use [mitchell, lanczos, hamming, gaussian, triangle, box]");
 
-static SkBitmapFilter *allocateBitmapFilter() {
+SkBitmapFilter *SkBitmapFilter::Allocate() {
     if (!strcmp(c_bitmapFilter, "mitchell")) {
         return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f));
-    } else if (!strcmp(c_bitmapFilter, "sinc")) {
-        return SkNEW_ARGS(SkSincFilter,(3));
+    } else if (!strcmp(c_bitmapFilter, "lanczos")) {
+        return SkNEW(SkLanczosFilter);
+    } else if (!strcmp(c_bitmapFilter, "hamming")) {
+        return SkNEW(SkHammingFilter);
     } else if (!strcmp(c_bitmapFilter, "gaussian")) {
         return SkNEW_ARGS(SkGaussianFilter,(2));
     } else if (!strcmp(c_bitmapFilter, "triangle")) {
@@ -168,159 +120,12 @@ SkBitmapProcState::chooseBitmapFilterProc() {
     }
 
     if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) {
-        fBitmapFilter = allocateBitmapFilter();
+        fBitmapFilter = SkBitmapFilter::Allocate();
     }
 
-    if (fInvType & SkMatrix::kAffine_Mask) {
+    if (fInvType & SkMatrix::kScale_Mask) {
         return highQualityFilter;
-    } else if (fInvType & SkMatrix::kScale_Mask) {
-        return highQualityFilter_ScaleOnly;
     } else {
         return NULL;
     }
 }
-
-static void divideByWeights(SkScalar *sums, SkScalar *weights, SkBitmap *dst) {
-    for (int y = 0 ; y < dst->height() ; y++) {
-        for (int x = 0 ; x < dst->width() ; x++) {
-            SkScalar fr = SkScalarDiv(sums[4*(y*dst->width() + x) + 0], weights[y*dst->width() + x]);
-            SkScalar fg = SkScalarDiv(sums[4*(y*dst->width() + x) + 1], weights[y*dst->width() + x]);
-            SkScalar fb = SkScalarDiv(sums[4*(y*dst->width() + x) + 2], weights[y*dst->width() + x]);
-            SkScalar fa = SkScalarDiv(sums[4*(y*dst->width() + x) + 3], weights[y*dst->width() + x]);
-            int a = SkClampMax(SkScalarRoundToInt(fa), 255);
-            int r = SkClampMax(SkScalarRoundToInt(fr), a);
-            int g = SkClampMax(SkScalarRoundToInt(fg), a);
-            int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
-            *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
-        }
-    }
-}
-
-static void upScaleHorizTranspose(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
-    for (int y = 0 ; y < dst->height() ; y++) {
-        for (int x = 0 ; x < dst->width() ; x++) {
-            float sx = (y + 0.5f) / scale - 0.5f;
-            int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1);
-            int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1);
-
-            SkScalar totalWeight = 0;
-            SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
-
-            for (int srcX = x0 ; srcX <= x1 ; srcX++) {
-                SkScalar weight = filter->lookupScalar(sx - srcX);
-                SkPMColor c = *src->getAddr32(srcX, x);
-                fr += SkScalarMul(weight,SkGetPackedR32(c));
-                fg += SkScalarMul(weight,SkGetPackedG32(c));
-                fb += SkScalarMul(weight,SkGetPackedB32(c));
-                fa += SkScalarMul(weight,SkGetPackedA32(c));
-                totalWeight += weight;
-            }
-            fr = SkScalarDiv(fr,totalWeight);
-            fg = SkScalarDiv(fg,totalWeight);
-            fb = SkScalarDiv(fb,totalWeight);
-            fa = SkScalarDiv(fa,totalWeight);
-
-            int a = SkClampMax(SkScalarRoundToInt(fa), 255);
-            int r = SkClampMax(SkScalarRoundToInt(fr), a);
-            int g = SkClampMax(SkScalarRoundToInt(fg), a);
-            int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
-            *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
-        }
-    }
-}
-
-static void downScaleHoriz(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
-    SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
-    SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
-
-    SkAutoTDeleteArray<SkScalar> ada1(sums);
-    SkAutoTDeleteArray<SkScalar> ada2(weights);
-
-    memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
-    memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
-
-    for (int y = 0 ; y < src->height() ; y++) {
-        for (int x = 0 ; x < src->width() ; x++) {
-            // splat each source pixel into the destination image
-            float dx = (x + 0.5f) * scale - 0.5f;
-            int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->width()-1);
-            int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->width()-1);
-
-            SkPMColor c = *src->getAddr32(x,y);
-
-            for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {
-                SkScalar weight = filter->lookup(dx - dst_x);
-                sums[4*(y*dst->width() + dst_x) + 0] += weight*SkGetPackedR32(c);
-                sums[4*(y*dst->width() + dst_x) + 1] += weight*SkGetPackedG32(c);
-                sums[4*(y*dst->width() + dst_x) + 2] += weight*SkGetPackedB32(c);
-                sums[4*(y*dst->width() + dst_x) + 3] += weight*SkGetPackedA32(c);
-                weights[y*dst->width() + dst_x] += weight;
-            }
-        }
-    }
-
-    divideByWeights(sums, weights, dst);
-}
-
-static void downScaleVert(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
-    SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
-    SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
-
-    SkAutoTDeleteArray<SkScalar> ada1(sums);
-    SkAutoTDeleteArray<SkScalar> ada2(weights);
-
-    memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
-    memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
-
-    for (int y = 0 ; y < src->height() ; y++) {
-        for (int x = 0 ; x < src->width() ; x++) {
-            // splat each source pixel into the destination image
-            float dy = (y + 0.5f) * scale - 0.5f;
-            int y0 = SkClampMax(sk_float_ceil2int(dy-filter->width()), dst->height()-1);
-            int y1 = SkClampMax(sk_float_ceil2int(dy+filter->width()), dst->height()-1);
-
-            SkPMColor c = *src->getAddr32(x,y);
-
-            for (int dst_y = y0 ; dst_y <= y1 ; dst_y++) {
-                SkScalar weight = filter->lookupScalar(dy - dst_y);
-                sums[4*(dst_y*dst->width() + x) + 0] += weight*SkGetPackedR32(c);
-                sums[4*(dst_y*dst->width() + x) + 1] += weight*SkGetPackedG32(c);
-                sums[4*(dst_y*dst->width() + x) + 2] += weight*SkGetPackedB32(c);
-                sums[4*(dst_y*dst->width() + x) + 3] += weight*SkGetPackedA32(c);
-                weights[dst_y*dst->width() + x] += weight;
-            }
-        }
-    }
-
-    divideByWeights(sums, weights, dst);
-}
-
-void SkBitmap::scale(SkBitmap *dst) const {
-
-    SkBitmap horizTemp;
-
-    horizTemp.setConfig(SkBitmap::kARGB_8888_Config, height(), dst->width());
-    horizTemp.allocPixels();
-
-    SkBitmapFilter *filter = allocateBitmapFilter();
-
-    float horizScale = float(dst->width()) / width();
-
-    if (horizScale >= 1) {
-        upScaleHorizTranspose(this, &horizTemp, horizScale, filter);
-    } else if (horizScale < 1) {
-        downScaleHoriz(this, &horizTemp, horizScale, filter);
-    }
-
-    float vertScale = float(dst->height()) / height();
-
-    if (vertScale >= 1) {
-        upScaleHorizTranspose(&horizTemp, dst, vertScale, filter);
-    } else if (vertScale < 1) {
-        downScaleVert(&horizTemp, dst, vertScale, filter);
-    }
-
-    SkDELETE(filter);
-}
diff --git a/src/core/SkBitmapFilter.h b/src/core/SkBitmapFilter.h
index 38c2448c69..6a9e3d7c01 100644
--- a/src/core/SkBitmapFilter.h
+++ b/src/core/SkBitmapFilter.h
@@ -26,28 +26,30 @@ class SkBitmapFilter {
           fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1);
       }
 
-      SkFixed lookup( float x ) const {
+      SkFixed lookup(float x) const {
           if (!fPrecomputed) {
               precomputeTable();
           }
           int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
           SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
-          return fFilterTable[ filter_idx ];
+          return fFilterTable[filter_idx];
       }
 
-      SkScalar lookupScalar( float x ) const {
+      SkScalar lookupScalar(float x) const {
           if (!fPrecomputed) {
               precomputeTable();
           }
           int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
           SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
-          return fFilterTableScalar[ filter_idx ];
+          return fFilterTableScalar[filter_idx];
       }
 
       float width() const { return fWidth; }
       float invWidth() const { return fInvWidth; }
       virtual float evaluate(float x) const = 0;
       virtual ~SkBitmapFilter() {}
+      
+      static SkBitmapFilter* Allocate();
   protected:
       float fWidth;
       float fInvWidth;
@@ -126,29 +128,47 @@ class SkBoxFilter: public SkBitmapFilter {
       }
 
       virtual float evaluate(float x) const SK_OVERRIDE {
-          return 1;
+          return (x >= -fWidth && x < fWidth) ? 1.0f : 0.0f;
       }
   protected:
 };
 
+class SkHammingFilter: public SkBitmapFilter {
+public:
+    SkHammingFilter(float width=1.f)
+    : SkBitmapFilter(width) {
+    }
+    virtual float evaluate(float x) const SK_OVERRIDE {
+        if (x <= -fWidth || x >= fWidth) {
+            return 0.0f;  // Zero everywhere outside the open window (-fWidth, fWidth).
+        }
+        if (x > -FLT_EPSILON && x < FLT_EPSILON) {
+            return 1.0f;  // sin(xpi)/xpi is 0/0 at the origin; return its limit, 1.
+        }
+        const float xpi = x * static_cast<float>(M_PI);
+        // Result is sinc(x) multiplied by the window term 0.54 + 0.46*cos(pi*x/fWidth).
+        return ((sk_float_sin(xpi) / xpi) *  // sinc(x)
+                (0.54f + 0.46f * sk_float_cos(xpi / fWidth)));  // hamming(x)
+    }
+};
 
-class SkSincFilter: public SkBitmapFilter {
+class SkLanczosFilter: public SkBitmapFilter {
   public:
-      SkSincFilter(float t, float width=3.f)
-      : SkBitmapFilter(width), tau(t) {
+      SkLanczosFilter(float width=3.f)
+      : SkBitmapFilter(width) {
       }
 
       virtual float evaluate(float x) const SK_OVERRIDE {
-          x = sk_float_abs(x * fInvWidth);
-          if (x < 1e-5f) return 1.f;
-          if (x > 1.f)   return 0.f;
-          x *= SK_ScalarPI;
-          float sinc = sk_float_sin(x) / x;
-          float lanczos = sk_float_sin(x * tau) / (x * tau);
-          return sinc * lanczos;
-      }
-  protected:
-      float tau;
+          if (x <= -fWidth || x >= fWidth) {
+              return 0.0f;  // Outside of the window.
+          }
+          if (x > -FLT_EPSILON && x < FLT_EPSILON) {              
+              return 1.0f;  // Special case the discontinuity at the origin.
+          }
+          float xpi = x * static_cast<float>(M_PI);
+          return (sk_float_sin(xpi) / xpi) *  // sinc(x)
+                  sk_float_sin(xpi / fWidth) / (xpi / fWidth);  // sinc(x/fWidth)
+      }      
 };
 
 
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index a8a9b03d9a..57af144034 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -11,6 +11,7 @@
 #include "SkPaint.h"
 #include "SkShader.h"   // for tilemodes
 #include "SkUtilsArm.h"
+#include "SkBitmapScaler.h"
 
 #if !SK_ARM_NEON_IS_NONE
 // These are defined in src/opts/SkBitmapProcState_arm_neon.cpp
@@ -99,23 +100,45 @@ void SkBitmapProcState::possiblyScaleImage() {
     if (fFilterQuality != kHQ_BitmapFilter) {
         return;
     }
-
-    // STEP 1: UPSAMPLE?
-
-    // Check to see if the transformation matrix is scaling up, and if
-    // the matrix is simple, and if we're doing high quality scaling.
-    // If so, do the bitmap scale here and remove the scaling component from the matrix.
-
-    if (fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
-        (fInvMatrix.getScaleX() < 1 || fInvMatrix.getScaleY() < 1) &&
+    
+    // see if our platform has any specialized convolution code.
+    
+    
+    // Point fConvolutionProcs at a stack local (instead of storing the structure
+    // in the proc state) to avoid a header dependency and keep recompiles cheap.
+    // NOTE(review): the member dangles once this function returns -- confirm unused.
+    
+    SkConvolutionProcs simd;
+    fConvolutionProcs = &simd;
+    
+    fConvolutionProcs->fExtraHorizontalReads = 0;
+    fConvolutionProcs->fConvolveVertically = NULL;
+    fConvolutionProcs->fConvolve4RowsHorizontally = NULL;
+    fConvolutionProcs->fConvolveHorizontally = NULL;
+    fConvolutionProcs->fApplySIMDPadding = NULL;
+    
+    this->platformConvolutionProcs();
+
+    // STEP 1: Highest quality direct scale?
+
+    // Check to see if the transformation matrix is simple, and if we're 
+    // doing high quality scaling.  If so, do the bitmap scale here and 
+    // remove the scaling component from the matrix.
+
+    if (fFilterQuality == kHQ_BitmapFilter &&
+        fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
         fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) {
-
+            
+        int dest_width  = SkScalarCeilToInt(fOrigBitmap.width() / fInvMatrix.getScaleX());
+        int dest_height = SkScalarCeilToInt(fOrigBitmap.height() / fInvMatrix.getScaleY());
+        
         // All the criteria are met; let's make a new bitmap.
-        fScaledBitmap.setConfig(SkBitmap::kARGB_8888_Config,
-                                (int)(fOrigBitmap.width() / fInvMatrix.getScaleX()),
-                                (int)(fOrigBitmap.height() / fInvMatrix.getScaleY()));
-        fScaledBitmap.allocPixels();
-        fOrigBitmap.scale(&fScaledBitmap);
+
+        fScaledBitmap = SkBitmapScaler::Resize( fOrigBitmap, SkBitmapScaler::RESIZE_BEST,
+                                                dest_width, dest_height, fConvolutionProcs );
+            
+        fScaledBitmap.lockPixels();
+            
         fBitmap = &fScaledBitmap;
 
         // set the inv matrix type to translate-only;
@@ -130,9 +153,9 @@ void SkBitmapProcState::possiblyScaleImage() {
         return;
     }
 
-    if (!fOrigBitmap.hasMipMap()) {
+    if (!fOrigBitmap.hasMipMap() && fFilterQuality != kNone_BitmapFilter) {
 
-        // STEP 2: DOWNSAMPLE
+        // STEP 2: MIPMAP DOWNSAMPLE?
 
         // Check to see if the transformation matrix is scaling *down*.
         // If so, automatically build mipmaps.
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index a644dd1e02..3c8e346807 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -31,6 +31,7 @@
 #endif
 
 class SkPaint;
+class SkConvolutionProcs;
 
 struct SkBitmapProcState {
 
@@ -59,7 +60,7 @@ struct SkBitmapProcState {
                                  const uint32_t[],
                                  int count,
                                  uint16_t colors[]);
-
+                                 
     typedef U16CPU (*FixedTileProc)(SkFixed);   // returns 0..0xFFFF
     typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int);   // returns 0..0xF
     typedef U16CPU (*IntTileProc)(int value, int count);   // returns 0..count-1
@@ -78,6 +79,8 @@ struct SkBitmapProcState {
     IntTileProc         fIntTileProcY;      // chooseProcs
     SkFixed             fFilterOneX;
     SkFixed             fFilterOneY;
+    
+    SkConvolutionProcs* fConvolutionProcs;         // possiblyScaleImage
 
     SkPMColor           fPaintPMColor;      // chooseProcs - A8 config
     SkFixed             fInvSx;             // chooseProcs
@@ -113,7 +116,12 @@ struct SkBitmapProcState {
         implementation can do nothing (see SkBitmapProcState_opts_none.cpp)
      */
     void platformProcs();
-
+    
+    /** Platforms can also optionally overwrite the convolution functions
+        if we have SIMD versions of them.
+      */
+      
+    void platformConvolutionProcs();
 
     /** Given the byte size of the index buffer to be passed to the matrix proc,
         return the maximum number of resulting pixels that can be computed
@@ -160,7 +168,7 @@ private:
 
     void possiblyScaleImage();
 
-    SkBitmapFilter *fBitmapFilter;
+    SkBitmapFilter* fBitmapFilter;
 
     ShaderProc32 chooseBitmapFilterProc();
 
@@ -218,8 +226,6 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
 void S32_D16_filter_DX(const SkBitmapProcState& s,
                                    const uint32_t* xy, int count, uint16_t* colors);
 
-void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
-                             SkPMColor *SK_RESTRICT colors, int count);
 void highQualityFilter(const SkBitmapProcState &s, int x, int y,
                    SkPMColor *SK_RESTRICT colors, int count);
 
diff --git a/src/core/SkBitmapScaler.cpp b/src/core/SkBitmapScaler.cpp
new file mode 100644
index 0000000000..7e840d2fdb
--- /dev/null
+++ b/src/core/SkBitmapScaler.cpp
@@ -0,0 +1,315 @@
+#include "SkBitmapScaler.h"
+#include "SkBitmapFilter.h"
+#include "SkRect.h"
+#include "SkTArray.h"
+#include "SkErrorInternals.h"
+#include "SkConvolver.h"
+
+// SkResizeFilter ----------------------------------------------------------------
+
+// Encapsulates computation and storage of the filters required for one complete
+// resize operation.
+class SkResizeFilter {
+public:
+    SkResizeFilter(SkBitmapScaler::ResizeMethod method,
+                   int srcFullWidth, int srcFullHeight,
+                   int destWidth, int destHeight,
+                   const SkIRect& destSubset,
+                   SkConvolutionProcs* convolveProcs);
+    ~SkResizeFilter() { 
+        SkDELETE( fBitmapFilter ); 
+    }
+    
+    // Accessors for the per-axis filters built by the constructor.
+    const SkConvolutionFilter1D& xFilter() { return fXFilter; }
+    const SkConvolutionFilter1D& yFilter() { return fYFilter; }
+
+private:
+    
+    SkBitmapFilter* fBitmapFilter;
+
+    // Computes one set of filters either horizontally or vertically. The caller
+    // passes a generic "lo"/"size" range rather than left/right or top/bottom
+    // so that the same code can be re-used in each dimension.
+    //
+    // |srcSize| is the full source extent along the chosen axis, and
+    // |destSubsetLo| / |destSubsetSize| give the half-open range of destination
+    // pixels, [lo, lo + size), to build filters for. |scale| is dest/src size.
+    //
+    // One filter per destination pixel is appended to |output|; if
+    // |convolveProcs| supplies fApplySIMDPadding, it is then applied to |output|.
+    
+    void computeFilters(int srcSize,
+                        int destSubsetLo, int destSubsetSize,
+                        float scale,
+                        SkConvolutionFilter1D* output,
+                        SkConvolutionProcs* convolveProcs);
+
+    // Subset of scaled destination bitmap to compute.
+    SkIRect fOutBounds;
+
+    SkConvolutionFilter1D fXFilter;
+    SkConvolutionFilter1D fYFilter;
+};
+
+SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method,
+                               int srcFullWidth, int srcFullHeight,
+                               int destWidth, int destHeight,
+                               const SkIRect& destSubset,
+                               SkConvolutionProcs* convolveProcs)
+                       : fOutBounds(destSubset) {
+    
+    // |method| must already be a concrete "algorithm method", not a quality method.
+    SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+             (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
+
+    switch(method) {
+        case SkBitmapScaler::RESIZE_BOX:
+            fBitmapFilter = SkNEW(SkBoxFilter);
+            break;
+        case SkBitmapScaler::RESIZE_TRIANGLE:
+            fBitmapFilter = SkNEW(SkTriangleFilter);
+            break;
+        case SkBitmapScaler::RESIZE_MITCHELL:
+            fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
+            break;
+        case SkBitmapScaler::RESIZE_HAMMING:
+            fBitmapFilter = SkNEW(SkHammingFilter);
+            break;
+        case SkBitmapScaler::RESIZE_LANCZOS3:
+            fBitmapFilter = SkNEW(SkLanczosFilter);
+            break;
+        default:
+            // Not expected to be reached (NOTREACHED); falls back to Mitchell.
+            fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
+            break;
+    }
+    
+    // Per-axis scale factor: destination extent divided by full source extent.
+    float scaleX = static_cast<float>(destWidth) /
+                   static_cast<float>(srcFullWidth);
+    float scaleY = static_cast<float>(destHeight) /
+                   static_cast<float>(srcFullHeight);
+
+    this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(),
+                         scaleX, &fXFilter, convolveProcs);
+    this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(),
+                         scaleY, &fYFilter, convolveProcs);
+}
+
+// TODO(egouriou): Take advantage of periods in the convolution.
+// Practical resizing filters are periodic outside of the border area.
+// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
+// source become p pixels in the destination) will have a period of p.
+// A nice consequence is a period of 1 when downscaling by an integral
+// factor. Downscaling from typical display resolutions is also bound
+// to produce interesting periods as those are chosen to have multiple
+// small factors.
+// Small periods reduce computational load and improve cache usage if
+// the coefficients can be shared. For periods of 1 we can consider
+// loading the factors only once outside the borders.
+void SkResizeFilter::computeFilters(int srcSize,
+                                  int destSubsetLo, int destSubsetSize,
+                                  float scale,
+                                  SkConvolutionFilter1D* output,
+                                  SkConvolutionProcs* convolveProcs) {
+  int destSubsetHi = destSubsetLo + destSubsetSize;  // [lo, hi)
+
+  // When we're doing a magnification, the scale will be larger than one. This
+  // means the destination pixels are much smaller than the source pixels, and
+  // that the range covered by the filter won't necessarily cover any source
+  // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+  // some computations.
+  float clampedScale = SkTMin(1.0f, scale);
+
+  // This is how many source pixels from the center we need to count
+  // to support the filtering function.
+  float srcSupport = fBitmapFilter->width() / clampedScale;
+
+  // Speed up the divisions below by turning them into multiplies.
+  float invScale = 1.0f / scale;
+
+  SkTArray<float> filterValues(64);
+  SkTArray<short> fixedFilterValues(64);
+
+  // Loop over all pixels in the output range. We will generate one set of
+  // filter values for each one. Those values will tell us how to blend the
+  // source pixels to compute the destination pixel.
+  for (int destSubsetI = destSubsetLo; destSubsetI < destSubsetHi;
+       destSubsetI++) {
+    // Reset the arrays. We don't declare them inside so they can re-use the
+    // same malloc-ed buffer.
+    filterValues.reset();
+    fixedFilterValues.reset();
+
+    // This is the pixel in the source directly under the pixel in the dest.
+    // Note that we base computations on the "center" of the pixels. To see
+    // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
+    // downscale should "cover" the pixels around the pixel with *its center*
+    // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
+    // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
+    float srcPixel = (static_cast<float>(destSubsetI) + 0.5f) * invScale;
+
+    // Compute the (inclusive) range of source pixels the filter covers.
+    int srcBegin = SkTMax(0, SkScalarFloorToInt(srcPixel - srcSupport));
+    int srcEnd = SkTMin(srcSize - 1, SkScalarCeilToInt(srcPixel + srcSupport));
+
+    // Compute the unnormalized filter value at each location of the source
+    // it covers.
+    float filterSum = 0.0f;  // Sum of the filter values for normalizing.
+    for (int curFilterPixel = srcBegin; curFilterPixel <= srcEnd;
+         curFilterPixel++) {
+      // Distance from the center of the filter, this is the filter coordinate
+      // in source space. We also need to consider the center of the pixel
+      // when comparing distance against 'srcPixel'. In the 5x downscale
+      // example used above the distance from the center of the filter to
+      // the pixel with coordinates (2, 2) should be 0, because its center
+      // is at (2.5, 2.5).
+      float srcFilterDist =
+          ((static_cast<float>(curFilterPixel) + 0.5f) - srcPixel);
+
+      // Since the filter really exists in dest space, map it there.
+      float destFilterDist = srcFilterDist * clampedScale;
+
+      // Compute the filter value at that location.
+      float filterValue = fBitmapFilter->evaluate(destFilterDist);
+      filterValues.push_back(filterValue);
+
+      filterSum += filterValue;
+    }
+    SkASSERT(!filterValues.empty());
+
+    // Normalize (divide by filterSum, assumed nonzero) so that the brightness of
+    // the image is unaffected, converting each weight to fixed point as we go.
+    short fixedSum = 0;
+    for (int i = 0; i < filterValues.count(); i++) {
+      short curFixed = output->FloatToFixed(filterValues[i] / filterSum);
+      fixedSum += curFixed;
+      fixedFilterValues.push_back(curFixed);
+    }
+
+    // The conversion to fixed point will leave some rounding errors, which
+    // we add back in to avoid affecting the brightness of the image. We
+    // arbitrarily add this to the center of the filter array (this won't always
+    // be the center of the filter function since it could get clipped on the
+    // edges, but it doesn't matter enough to worry about that case).
+    short leftovers = output->FloatToFixed(1.0f) - fixedSum;
+    fixedFilterValues[fixedFilterValues.count() / 2] += leftovers;
+
+    // Now it's ready to go.
+    output->AddFilter(srcBegin, &fixedFilterValues[0],
+                      static_cast<int>(fixedFilterValues.count()));
+  }
+
+  if (convolveProcs->fApplySIMDPadding) {
+      convolveProcs->fApplySIMDPadding( output );
+  }
+}
+
+// Translates a "quality" resize method into the concrete filter algorithm
+// that implements it; algorithm methods are returned unchanged.
+static SkBitmapScaler::ResizeMethod ResizeMethodToAlgorithmMethod(
+                                    SkBitmapScaler::ResizeMethod method) {
+    const bool isAlgorithmMethod =
+        SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method &&
+        method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD;
+    if (isAlgorithmMethod) {
+        // The caller already named a specific filter.
+        return method;
+    }
+
+    if (SkBitmapScaler::RESIZE_GOOD == method) {
+        // RESIZE_GOOD trades a lot of quality for speed, so any of the
+        // "good" software filters is acceptable; a triangle is used.
+        return SkBitmapScaler::RESIZE_TRIANGLE;
+    }
+
+    if (SkBitmapScaler::RESIZE_BETTER == method) {
+        // RESIZE_BETTER trades some quality for speed without devolving to
+        // linear resampling. Hamming-1 looks worse than Lanczos-2 but is
+        // about 40% faster, which was deemed an acceptable trade-off.
+        return SkBitmapScaler::RESIZE_HAMMING;
+    }
+
+    // Every remaining value, RESIZE_BEST included, resolves to the Mitchell
+    // filter.
+    return SkBitmapScaler::RESIZE_MITCHELL;
+}
+
+// static
+// Resamples |source| as if it were scaled to destWidth x destHeight and returns
+// the |destSubset| rectangle of that image, or an empty bitmap on failure.
+SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
+                                ResizeMethod method,
+                                int destWidth, int destHeight,
+                                const SkIRect& destSubset,
+                                SkConvolutionProcs* convolveProcs,
+                                SkBitmap::Allocator* allocator) {
+    // Ensure that the ResizeMethod enumeration is sound.
+    SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
+        (method <= RESIZE_LAST_QUALITY_METHOD)) ||
+        ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+        (method <= RESIZE_LAST_ALGORITHM_METHOD)));
+
+    // A subset that sticks out of the destination is a caller error: report
+    // it and bail out instead of building filters for nonexistent pixels.
+    SkIRect dest = { 0, 0, destWidth, destHeight };
+    if (!dest.contains(destSubset)) {
+        SkErrorInternals::SetError( kInvalidArgument_SkError,
+                                    "Sorry, the destination bitmap scale subset "
+                                    "falls outside the full destination bitmap." );
+        return SkBitmap();
+    }
+
+    // A 0x0, 0xN or Nx0 source or destination yields an empty result.
+    if (source.width() < 1 || source.height() < 1 ||
+        destWidth < 1 || destHeight < 1) {
+        return SkBitmap();
+    }
+
+    // |convolveProcs| defaults to NULL but is dereferenced below; fall back
+    // to an empty table. NOTE(review): assumes SkConvolutionProcs() is zeroed.
+    SkConvolutionProcs portableProcs = SkConvolutionProcs();
+    if (NULL == convolveProcs)
+        convolveProcs = &portableProcs;
+
+    // From this point onward we only deal with "algorithm methods".
+    method = ResizeMethodToAlgorithmMethod(method);
+    SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+        (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
+
+    SkAutoLockPixels locker(source);
+    if (!source.readyToDraw() || source.config() != SkBitmap::kARGB_8888_Config)
+        return SkBitmap();
+
+    SkResizeFilter filter(method, source.width(), source.height(),
+                          destWidth, destHeight, destSubset, convolveProcs);
+    const unsigned char* sourceSubset =
+        reinterpret_cast<const unsigned char*>(source.getPixels());
+    // Convolve into the result.
+    SkBitmap result;
+    result.setConfig(SkBitmap::kARGB_8888_Config,
+        destSubset.width(), destSubset.height());
+    result.allocPixels(allocator, NULL);
+    if (!result.readyToDraw())
+        return SkBitmap();
+    BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()),
+        !source.isOpaque(), filter.xFilter(), filter.yFilter(),
+        static_cast<int>(result.rowBytes()),
+        static_cast<unsigned char*>(result.getPixels()),
+        convolveProcs, true);
+    // Preserve the "opaque" flag for use as an optimization later.
+    result.setIsOpaque(source.isOpaque());
+    return result;
+}
+
+// static -- convenience overload that returns the whole scaled image.
+SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
+                                ResizeMethod method,
+                                int destWidth, int destHeight,
+                                SkConvolutionProcs* convolveProcs,
+                                SkBitmap::Allocator* allocator) {
+    const SkIRect fullDest = { 0, 0, destWidth, destHeight };
+    return Resize(source, method, destWidth, destHeight, fullDest,
+                  convolveProcs, allocator);
+}
diff --git a/src/core/SkBitmapScaler.h b/src/core/SkBitmapScaler.h
new file mode 100644
index 0000000000..5682cc578d
--- /dev/null
+++ b/src/core/SkBitmapScaler.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+ 
+#ifndef SkBitmapScaler_DEFINED
+#define SkBitmapScaler_DEFINED
+
+#include "SkBitmap.h"
+#include "SkConvolver.h"
+ 
+/** \class SkBitmapScaler
+
+    Provides the interface for high quality image resampling.
+ */
+    
+class SK_API SkBitmapScaler {
+public:    
+    enum ResizeMethod {
+        // Quality Methods
+        //
+        // These enumeration values express a desired quality/speed tradeoff.
+        // They are translated into an algorithm-specific method that depends
+        // on the capabilities (CPU, GPU) of the underlying platform.
+        // It is possible for all three methods to be mapped to the same
+        // algorithm on a given platform.
+        
+        // Good quality resizing. Fastest resizing with acceptable visual quality.
+        // This is typically intended for use during interactive layouts
+        // where slower platforms may want to trade image quality for large
+        // increase in resizing performance.
+        //
+        // For example the resizing implementation may devolve to linear
+        // filtering if this enables GPU acceleration to be used.
+        //
+        // Note that the underlying resizing method may be determined
+        // on the fly based on the parameters for a given resize call.
+        // For example an implementation using a GPU-based linear filter
+        // in the common case may still use a higher-quality software-based
+        // filter in cases where using the GPU would actually be slower - due
+        // to too much latency - or impossible - due to image format or size
+        // constraints.
+        RESIZE_GOOD,
+
+        // Medium quality resizing. Close to high quality resizing (better
+        // than linear interpolation) with potentially some quality being
+        // traded-off for additional speed compared to RESIZE_BEST.
+        //
+        // This is intended, for example, for generation of large thumbnails
+        // (hundreds of pixels in each dimension) from large sources, where
+        // a linear filter would produce too many artifacts but where
+        // RESIZE_BEST might be too costly time-wise.
+        RESIZE_BETTER,
+
+        // High quality resizing. The algorithm is picked to favor image quality.
+        RESIZE_BEST,
+        
+        //
+        // Algorithm-specific enumerations
+        //
+        
+        // Box filter. This is a weighted average of all of the pixels touching
+        // the destination pixel. For enlargement, this is nearest neighbor.
+        //
+        // You probably don't want this, it is here for testing since it is easy to
+        // compute. Use RESIZE_LANCZOS3 instead.
+        RESIZE_BOX,
+        RESIZE_TRIANGLE,  // Software filter used for RESIZE_GOOD.
+        RESIZE_LANCZOS3,
+        RESIZE_HAMMING,   // Software filter used for RESIZE_BETTER.
+        RESIZE_MITCHELL,  // Software filter used for RESIZE_BEST.
+        
+        // Aliases bounding each group above; used to range-check a method.
+        RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
+        RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
+        RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
+        RESIZE_LAST_ALGORITHM_METHOD = RESIZE_MITCHELL,
+    };
+    
+    // Resizes the given source bitmap using the specified resize method, so that
+    // the entire image is (dest_width x dest_height) big. The dest_subset is the
+    // rectangle in this destination image that should actually be returned.
+    //
+    // The output image will be (dest_subset.width(), dest_subset.height()). This
+    // will save work if you do not need the entire bitmap.
+    //
+    // The destination subset must be contained in the destination image.
+    static SkBitmap Resize(const SkBitmap& source,
+                           ResizeMethod method,
+                           int dest_width, int dest_height,
+                           const SkIRect& dest_subset,
+                           SkConvolutionProcs *convolveProcs = NULL,
+                           SkBitmap::Allocator* allocator = NULL);
+
+    // Alternate version for resizing and returning the entire bitmap rather than
+    // a subset.
+    static SkBitmap Resize(const SkBitmap& source,
+                           ResizeMethod method,
+                           int dest_width, int dest_height,
+                           SkConvolutionProcs *convolveProcs = NULL,
+                           SkBitmap::Allocator* allocator = NULL);
+};
+
+#endif
diff --git a/src/core/SkConvolver.cpp b/src/core/SkConvolver.cpp
new file mode 100644
index 0000000000..54e46b63fe
--- /dev/null
+++ b/src/core/SkConvolver.cpp
@@ -0,0 +1,473 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "SkConvolver.h"
+#include "SkSize.h"
+#include "SkTypes.h"
+
+namespace {
+
+    // Saturates |a| into the range of an 8-bit unsigned channel: values below
+    // 0 become 0 and values above 255 become 255.
+    inline unsigned char ClampTo8(int a) {
+        // A single unsigned comparison accepts exactly the in-range values,
+        // because negative inputs wrap around to large unsigned numbers.
+        const bool inRange = static_cast<unsigned>(a) <= 255u;
+        if (inRange) {
+            return static_cast<unsigned char>(a);
+        }
+        return (a < 0) ? 0 : 255;
+    }
+
+    // Converts an accumulated convolution sum back into an 8-bit channel value.
+    // The filter weights are fixed point with kShiftBits fractional bits, so
+    // the sum is shifted down by that amount before being clamped. When
+    // |takeAbsolute| is set, the magnitude of the shifted sum is used.
+    inline unsigned char BringBackTo8(int a, bool takeAbsolute) {
+        const int shifted = a >> SkConvolutionFilter1D::kShiftBits;
+        if (!takeAbsolute) {
+            return ClampTo8(shifted);
+        }
+        return ClampTo8(abs(shifted));
+    }
+
+    // A fixed-size ring of pixel rows. Rows are claimed for writing one at a
+    // time with advanceRow(); the ring keeps track of which slot is handed
+    // out next, and of the coordinate that the next claimed row will have
+    // (and thereby of the total number of rows handed out so far).
+    class CircularRowBuffer {
+    public:
+        // |destRowPixelWidth| is the number of pixels in each stored row (four
+        // bytes are reserved per pixel) and |maxYFilterSize| is the number of
+        // rows the ring holds (we only need to store enough rows for the
+        // biggest filter).
+        //
+        // |firstInputRow| is the coordinate of the first row handed out by
+        // advanceRow(); each later row is one coordinate further on.
+        CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize,
+                          int firstInputRow)
+            : fRowByteWidth(destRowPixelWidth * 4),
+              fNumRows(maxYFilterSize),
+              fNextRow(0),
+              fNextRowCoordinate(firstInputRow) {
+            // Size the pixel storage and the address scratch array up front.
+            fBuffer.reset(fRowByteWidth * maxYFilterSize);
+            fRowAddresses.reset(fNumRows);
+        }
+
+        // Claims the next slot of the ring and returns a pointer to the
+        // beginning of it.
+        unsigned char* advanceRow() {
+            unsigned char* const claimed = &fBuffer[fNextRow * fRowByteWidth];
+
+            // Step to the following slot, wrapping back to the start of the
+            // ring once its end is reached. The coordinate never wraps.
+            const int following = fNextRow + 1;
+            fNextRow = (following == fNumRows) ? 0 : following;
+            fNextRowCoordinate++;
+            return claimed;
+        }
+
+        // Returns the rows of the ring as an "unrolled" array ordered by
+        // increasing coordinate, one entry per slot. |*firstRowIndex| receives
+        // the coordinate of the first entry.
+        //
+        // |*firstRowIndex| may be negative. This means the ring starts before
+        // the top of the image (it hasn't been filled yet).
+        unsigned char* const* GetRowAddresses(int* firstRowIndex) {
+            // Example for a 4-element circular buffer holding coords 6-9.
+            //   Row 0   Coord 8
+            //   Row 1   Coord 9
+            //   Row 2   Coord 6  <- fNextRow = 2, fNextRowCoordinate = 10.
+            //   Row 3   Coord 7
+            //
+            // The slot handed out next is also the one holding the first
+            // (lowest) coordinate. The subtraction may yield a negative value,
+            // but that's OK: the user of this buffer computes offsets relative
+            // to |*firstRowIndex| and the negative rows will never be used.
+            *firstRowIndex = fNextRowCoordinate - fNumRows;
+
+            // Walk the ring once, starting at the slot handed out next.
+            int slot = fNextRow;
+            for (int i = 0; i < fNumRows; i++) {
+                fRowAddresses[i] = &fBuffer[slot * fRowByteWidth];
+
+                // Move on to the next slot, wrapping if necessary.
+                slot = (slot + 1 == fNumRows) ? 0 : slot + 1;
+            }
+            return &fRowAddresses[0];
+        }
+
+    private:
+        // Backing storage: the rows are packed, fRowByteWidth bytes apiece.
+        SkTArray<unsigned char> fBuffer;
+
+        // Size in bytes of one row inside |fBuffer|.
+        int fRowByteWidth;
+
+        // How many rows the ring can hold.
+        int fNumRows;
+
+        // Slot that the next advanceRow() call hands out. Wraps around as
+        // the ring is used.
+        int fNextRow;
+
+        // Coordinate of the row in slot |fNextRow|. Incremented each time a
+        // row is handed out; does not wrap.
+        int fNextRowCoordinate;
+
+        // Scratch array filled in and returned by GetRowAddresses().
+        SkTArray<unsigned char*> fRowAddresses;
+    };
+
+// Convolves horizontally along a single row. |srcData| is the start of the
+// source row and |outRow| receives filter.numValues() output pixels of 4 bytes
+// each.
+//
+// When |hasAlpha| is false the fourth byte of every source pixel is ignored
+// and the fourth byte of every output pixel is left unwritten.
+template<bool hasAlpha>
+    void ConvolveHorizontally(const unsigned char* srcData,
+                              const SkConvolutionFilter1D& filter,
+                              unsigned char* outRow) {
+        // Number of leading bytes of each pixel that get filtered.
+        const int channelCount = hasAlpha ? 4 : 3;
+
+        // Produce the output pixels of this row one at a time.
+        const int outputPixels = filter.numValues();
+        for (int outX = 0; outX < outputPixels; outX++) {
+            // Look up the weights that determine the current output pixel.
+            int firstSrcPixel;
+            int weightCount;
+            const SkConvolutionFilter1D::ConvolutionFixed* weights =
+                filter.FilterForValue(outX, &firstSrcPixel, &weightCount);
+
+            // The weights apply to |weightCount| consecutive source pixels
+            // (4 bytes each), the first of which is |firstSrcPixel|.
+            const unsigned char* srcPixel = &srcData[firstSrcPixel * 4];
+
+            // Accumulate the weighted sum of each filtered channel. Only the
+            // first |channelCount| accumulators are used.
+            int accum[4] = {0};
+            for (int tap = 0; tap < weightCount; tap++) {
+                const SkConvolutionFilter1D::ConvolutionFixed weight =
+                    weights[tap];
+                // Every product is computed in int, the accumulator type.
+                for (int c = 0; c < channelCount; c++) {
+                    accum[c] += weight * srcPixel[tap * 4 + c];
+                }
+            }
+
+            // Bring the sums back in range: the weights are fixed point with
+            // kShiftBits bits of fractional part, so shift those bits out and
+            // clamp what is left to 8 bits before storing it.
+            unsigned char* dstPixel = &outRow[outX * 4];
+            for (int c = 0; c < channelCount; c++) {
+                const int value = accum[c] >> SkConvolutionFilter1D::kShiftBits;
+                dstPixel[c] = ClampTo8(value);
+            }
+        }
+    }
+
+// Does vertical convolution to produce one output row. |filterValues| holds
+// |filterLength| weights, and weight i is applied to the row that
+// sourceDataRows[i] points to. Every row is |pixelWidth| pixels wide.
+//
+// The output must have room for |pixelWidth * 4| bytes. When |hasAlpha| is
+// false the alpha bytes of the rows are ignored and the alpha byte of every
+// output pixel is set to 0xff.
+template<bool hasAlpha>
+    void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow) {
+        // Number of leading bytes of each pixel that get filtered.
+        const int channelCount = hasAlpha ? 4 : 3;
+
+        // Each column is convolved on its own and yields one output pixel.
+        for (int outX = 0; outX < pixelWidth; outX++) {
+            // Offset, within every row, of the 4 bytes of this column's pixel.
+            const int byteOffset = outX * 4;
+
+            // Accumulate the weighted sum of each filtered channel down the
+            // column.
+            int accum[4] = {0};
+            for (int filterY = 0; filterY < filterLength; filterY++) {
+                const SkConvolutionFilter1D::ConvolutionFixed weight =
+                    filterValues[filterY];
+                const unsigned char* srcPixel =
+                    &sourceDataRows[filterY][byteOffset];
+                for (int c = 0; c < channelCount; c++) {
+                    accum[c] += weight * srcPixel[c];
+                }
+            }
+
+            // Bring the color sums back in range: the weights are fixed point
+            // with kShiftBits bits of precision, so shift those bits out and
+            // clamp what is left to 8 bits before storing it. The three color
+            // channels are stored for opaque and non-opaque sources alike.
+            unsigned char* dstPixel = &outRow[byteOffset];
+            for (int c = 0; c < 3; c++) {
+                const int value = accum[c] >> SkConvolutionFilter1D::kShiftBits;
+                dstPixel[c] = ClampTo8(value);
+            }
+
+            if (!hasAlpha) {
+                // No alpha channel, the image is opaque.
+                dstPixel[3] = 0xff;
+                continue;
+            }
+
+            // Alpha is shifted and clamped the same way as the colors.
+            const unsigned char alpha =
+                ClampTo8(accum[3] >> SkConvolutionFilter1D::kShiftBits);
+
+            // Make sure the alpha channel doesn't come out smaller than any of
+            // the color channels. We use premultiplied alpha channels, so this
+            // should never happen, but rounding errors will cause this from
+            // time to time. These "impossible" colors will cause overflows (and
+            // hence random pixel values) when the resulting bitmap is drawn to
+            // the screen.
+            //
+            // We only need to do this when generating the final output row
+            // (here).
+            const unsigned char maxColorChannel =
+                SkTMax(dstPixel[0], SkTMax(dstPixel[1], dstPixel[2]));
+            dstPixel[3] = SkTMax(alpha, maxColorChannel);
+        }
+    }
+
+    // Runtime front end for the templated ConvolveVertically above: picks the
+    // instantiation that matches |sourceHasAlpha|.
+    void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow,
+                            bool sourceHasAlpha) {
+        if (!sourceHasAlpha) {
+            ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
+                                      pixelWidth, outRow);
+            return;
+        }
+        ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
+                                 pixelWidth, outRow);
+    }
+
+}  // namespace
+
+// SkConvolutionFilter1D ---------------------------------------------------------
+
+// Starts out with a maximum filter length of zero.
+SkConvolutionFilter1D::SkConvolutionFilter1D() : fMaxFilter(0) {
+}
+
+// Nothing to release by hand.
+SkConvolutionFilter1D::~SkConvolutionFilter1D() {}
+
+// Float overload: converts the weights to fixed point, then adds the filter.
+void SkConvolutionFilter1D::AddFilter(int filterOffset,
+                                      const float* filterValues,
+                                      int filterLength) {
+    SkASSERT(filterLength > 0);
+    // reset(n) already sizes the array to n entries, so they are assigned in
+    // place; push_back() would append n more and leave the first n unset.
+    SkTArray<ConvolutionFixed> fixedValues;
+    fixedValues.reset(filterLength);
+    for (int i = 0; i < filterLength; ++i) {
+        fixedValues[i] = FloatToFixed(filterValues[i]);
+    }
+    AddFilter(filterOffset, &fixedValues[0], filterLength);
+}
+
+// Adds the filter for the next output pixel: |filterLength| fixed point
+// weights, the first of which applies to source position |filterOffset|.
+void SkConvolutionFilter1D::AddFilter(int filterOffset,
+                                      const ConvolutionFixed* filterValues,
+                                      int filterLength) {
+    // Leading and trailing weights are commonly zero, and it is beneficial to
+    // store only the central factors: for a scaling to 1/4th in each dimension
+    // using a Lanczos-2 filter on a 1080p image this optimization gives a
+    // ~10% speed improvement.
+    const int specifiedLength = filterLength;
+
+    int firstNonZero = 0;
+    while (firstNonZero < specifiedLength && 0 == filterValues[firstNonZero]) {
+        ++firstNonZero;
+    }
+
+    int trimmedLength = 0;  // Stays zero when all the factors are zeroes.
+    if (firstNonZero < specifiedLength) {
+        // At least one factor is non-zero, so the backwards scan stops at or
+        // after |firstNonZero|.
+        int lastNonZero = specifiedLength - 1;
+        while (lastNonZero >= 0 && 0 == filterValues[lastNonZero]) {
+            --lastNonZero;
+        }
+
+        filterOffset += firstNonZero;
+        trimmedLength = lastNonZero + 1 - firstNonZero;
+        SkASSERT(trimmedLength > 0);
+        for (int i = firstNonZero; i <= lastNonZero; i++) {
+            fFilterValues.push_back(filterValues[i]);
+        }
+    }
+
+    // The stored factors are the last |trimmedLength| entries of fFilterValues.
+    FilterInstance instance;
+    instance.fDataLocation =
+        static_cast<int>(fFilterValues.count()) - trimmedLength;
+    instance.fOffset = filterOffset;
+    instance.fTrimmedLength = trimmedLength;
+    instance.fLength = specifiedLength;
+    fFilters.push_back(instance);
+
+    fMaxFilter = SkTMax(fMaxFilter, trimmedLength);
+}
+
+// Reports the first filter: its offset, trimmed length and originally specified
+// length. Returns its weights, or NULL when the trimmed length is zero.
+const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter(
+                                        int* specifiedFilterlength,
+                                        int* filterOffset,
+                                        int* filterLength) const {
+    const FilterInstance& first = fFilters[0];
+    *filterOffset = first.fOffset;
+    *filterLength = first.fTrimmedLength;
+    *specifiedFilterlength = first.fLength;
+
+    const bool hasWeights = (first.fTrimmedLength != 0);
+    return hasWeights ? &fFilterValues[first.fDataLocation] : NULL;
+}
+
+void BGRAConvolve2D(const unsigned char* sourceData,
+                    int sourceByteRowStride,
+                    bool sourceHasAlpha,
+                    const SkConvolutionFilter1D& filterX,
+                    const SkConvolutionFilter1D& filterY,
+                    int outputByteRowStride,
+                    unsigned char* output,
+                    SkConvolutionProcs* convolveProcs,
+                    bool useSimdIfPossible) {  // NOTE: useSimdIfPossible is not read below.
+    // Filters rows with |filterX|, then columns with |filterY|, into |output|.
+    int maxYFilterSize = filterY.maxFilter();
+
+    // The next row in the input that we will generate a horizontally
+    // convolved row for. If the filter doesn't start at the beginning of the
+    // image (this is the case when we are only resizing a subset), then we
+    // don't want to generate any output rows before that. Compute the starting
+    // row for convolution as the first pixel for the first vertical filter.
+    int filterOffset, filterLength;
+    const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
+        filterY.FilterForValue(0, &filterOffset, &filterLength);
+    int nextXRow = filterOffset;
+
+    // We loop over each row in the input doing a horizontal convolution. This
+    // will result in a horizontally convolved image. We write the results into
+    // a circular buffer of convolved rows and do vertical convolution as rows
+    // are available. This prevents us from having to store the entire
+    // intermediate image and helps cache coherency.
+    // Four extra rows are needed when four rows can be convolved horizontally
+    // at once. The row buffer width is also rounded up to a multiple of 16
+    // pixels.
+    // TODO(jiesun): We do not use aligned load from row buffer in vertical
+    // convolution pass yet. Somehow Windows does not like it.
+    int rowBufferWidth = (filterX.numValues() + 15) & ~0xF;
+    int rowBufferHeight = maxYFilterSize +
+                          (convolveProcs->fConvolve4RowsHorizontally ? 4 : 0);  // NOTE(review): convolveProcs must be non-NULL.
+    CircularRowBuffer rowBuffer(rowBufferWidth,
+                                rowBufferHeight,
+                                filterOffset);
+
+    // Loop over every possible output row, processing just enough horizontal
+    // convolutions to run each subsequent vertical convolution.
+    SkASSERT(outputByteRowStride >= filterX.numValues() * 4);
+    int numOutputRows = filterY.numValues();
+
+    // We need to check which is the last line to convolve before we advance 4
+    // lines in one iteration.
+    int lastFilterOffset, lastFilterLength;
+
+    // SSE2 can access up to 3 extra pixels past the end of the
+    // buffer. At the bottom of the image, we have to be careful
+    // not to access data past the end of the buffer. Normally
+    // we fall back to the C++ implementation for the last row.
+    // If the last row is less than 3 pixels wide, we may have to fall
+    // back to the C++ version for more rows. Compute how many
+    // rows we need to avoid the SSE implementation for here.
+    filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset,
+                           &lastFilterLength);
+    int avoidSimdRows = 1 + convolveProcs->fExtraHorizontalReads /
+        (lastFilterOffset + lastFilterLength);  // NOTE(review): divides by zero if offset and length are both 0.
+
+    filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
+                           &lastFilterLength);
+
+    for (int outY = 0; outY < numOutputRows; outY++) {
+        filterValues = filterY.FilterForValue(outY,
+                                              &filterOffset, &filterLength);
+
+        // Generate output rows until we have enough to run the current filter.
+        while (nextXRow < filterOffset + filterLength) {
+            if (convolveProcs->fConvolve4RowsHorizontally &&
+                nextXRow + 3 < lastFilterOffset + lastFilterLength -
+                avoidSimdRows) {
+                const unsigned char* src[4];
+                unsigned char* outRow[4];
+                for (int i = 0; i < 4; ++i) {
+                    src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride];
+                    outRow[i] = rowBuffer.advanceRow();
+                }
+                convolveProcs->fConvolve4RowsHorizontally(src, filterX, outRow);
+                nextXRow += 4;
+            } else {
+                // Check if we need to avoid SSE2 for this row.
+                if (convolveProcs->fConvolveHorizontally &&
+                    nextXRow < lastFilterOffset + lastFilterLength -
+                    avoidSimdRows) {
+                    convolveProcs->fConvolveHorizontally(
+                        &sourceData[nextXRow * sourceByteRowStride],
+                        filterX, rowBuffer.advanceRow(), sourceHasAlpha);
+                } else {
+                    if (sourceHasAlpha) {
+                        ConvolveHorizontally<true>(
+                            &sourceData[nextXRow * sourceByteRowStride],
+                            filterX, rowBuffer.advanceRow());
+                    } else {
+                        ConvolveHorizontally<false>(
+                            &sourceData[nextXRow * sourceByteRowStride],
+                            filterX, rowBuffer.advanceRow());
+                    }
+                }
+                nextXRow++;
+            }
+        }
+
+        // Compute where in the output image this row of final data will go.
+        unsigned char* curOutputRow = &output[outY * outputByteRowStride];
+
+        // Get the list of rows that the circular buffer has, in order.
+        int firstRowInCircularBuffer;
+        unsigned char* const* rowsToConvolve =
+            rowBuffer.GetRowAddresses(&firstRowInCircularBuffer);
+
+        // Now compute the start of the subset of those rows that the filter
+        // needs.
+        unsigned char* const* firstRowForFilter =
+            &rowsToConvolve[filterOffset - firstRowInCircularBuffer];
+
+        if (convolveProcs->fConvolveVertically) {
+            convolveProcs->fConvolveVertically(filterValues, filterLength,
+                                               firstRowForFilter,
+                                               filterX.numValues(), curOutputRow,
+                                               sourceHasAlpha);
+        } else {
+            ConvolveVertically(filterValues, filterLength,
+                               firstRowForFilter,
+                               filterX.numValues(), curOutputRow,
+                               sourceHasAlpha);
+        }
+    }
+}
diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h
new file mode 100644
index 0000000000..a2758e57a8
--- /dev/null
+++ b/src/core/SkConvolver.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SK_CONVOLVER_H
+#define SK_CONVOLVER_H
+
+#include "SkSize.h"
+#include "SkTypes.h"
+#include "SkTArray.h"
+
+// avoid confusion with Mac OS X's math library (Carbon)
+#if defined(__APPLE__)
+#undef FloatToConvolutionFixed
+#undef ConvolutionFixedToFloat
+#endif
+
+// Represents a filter in one dimension. Each output pixel has one entry in this
+// object for the filter values contributing to it. You build up the filter
+// list by calling AddFilter for each output pixel (in order).
+//
+// We do 2-dimensional convolution by first convolving each row by one
+// SkConvolutionFilter1D, then convolving each column by another one.
+//
+// Entries are stored as ConvolutionFixed fixed-point values, shifted left by kShiftBits.
+class SkConvolutionFilter1D {
+public:
+    typedef short ConvolutionFixed;
+
+    // The number of bits that ConvolutionFixed values are shifted by (1.0f maps to 1 << kShiftBits).
+    enum { kShiftBits = 14 };
+
+    SK_API SkConvolutionFilter1D();
+    SK_API ~SkConvolutionFilter1D();
+
+    // Convert between floating point and our ConvolutionFixed representation.
+    static ConvolutionFixed FloatToFixed(float f) {
+        return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
+    }
+    static unsigned char FixedToChar(ConvolutionFixed x) {
+        return static_cast<unsigned char>(x >> kShiftBits);
+    }
+    static float FixedToFloat(ConvolutionFixed x) {
+        // The cast relies on ConvolutionFixed being a short, implying that on
+        // the platforms we care about all (16) bits will fit into
+        // the mantissa of a (32-bit) float.
+        SK_COMPILE_ASSERT(sizeof(ConvolutionFixed) == 2, ConvolutionFixed_type_should_fit_in_float_mantissa);
+        float raw = static_cast<float>(x);
+        return ldexpf(raw, -kShiftBits);
+    }
+
+    // Returns the maximum pixel span of a filter.
+    int maxFilter() const { return fMaxFilter; }
+
+    // Returns the number of per-output-pixel filters stored in this object.
+    // This is the dimension of the output image.
+    int numValues() const { return static_cast<int>(fFilters.count()); }
+
+    // Appends the given list of scaling values for generating a given output
+    // pixel. |filterOffset| is the distance from the edge of the image to where
+    // the scaling factors start. The scaling factors apply to the source pixels
+    // starting from this position, and going for the next |filterLength| pixels.
+    //
+    // You will probably want to make sure your input is normalized (that is,
+    // all entries in |filterValues| sum to one) to prevent affecting the overall
+    // brightness of the image.
+    //
+    // The filterLength must be > 0.
+    //
+    // This version will automatically convert your input to ConvolutionFixed point.
+    SK_API void AddFilter(int filterOffset,
+                          const float* filterValues,
+                          int filterLength);
+
+    // Same as the above version, but the input is already ConvolutionFixed point.
+    void AddFilter(int filterOffset,
+                   const ConvolutionFixed* filterValues,
+                   int filterLength);
+
+    // Retrieves a filter for the given |valueOffset|, a position in the output
+    // image in the direction we're convolving. The offset and length of the
+    // filter values are put into the corresponding out arguments (see AddFilter
+    // above for what these mean), and a pointer to the first scaling factor is
+    // returned (NULL when the stored length is 0). There will be |filterLength| values in this array.
+    inline const ConvolutionFixed* FilterForValue(int valueOffset, 
+                                       int* filterOffset,
+                                       int* filterLength) const {
+        const FilterInstance& filter = fFilters[valueOffset];
+        *filterOffset = filter.fOffset;
+        *filterLength = filter.fTrimmedLength;
+        if (filter.fTrimmedLength == 0) {
+            return NULL;
+        }
+        return &fFilterValues[filter.fDataLocation];
+    }
+
+    // Retrieves the filter for the offset 0, presumed to be the one and only.
+    // The offset and length of the filter values are put into the corresponding
+    // out arguments (see AddFilter). Note that |filterLength| and
+    // |specifiedFilterLength| may be different if leading/trailing zeros of the
+    // original floating point form were clipped.
+    // There will be |filterLength| values in the return array.
+    // Returns NULL if the filter is 0-length (for instance when all floating
+    // point values passed to AddFilter were clipped to 0).
+    SK_API const ConvolutionFixed* GetSingleFilter(int* specifiedFilterLength,
+        int* filterOffset,
+        int* filterLength) const;
+
+    // Add another value to the fFilterValues array -- useful for
+    // SIMD padding which happens outside of this class.
+    
+    void addFilterValue( ConvolutionFixed val ) {
+        fFilterValues.push_back( val );
+    }
+private:
+    struct FilterInstance {
+        // Offset within fFilterValues for this instance of the filter.
+        int fDataLocation;
+
+        // Source offset, IN PIXELS; reported as |filterOffset| by FilterForValue (see AddFilter).
+        int fOffset;
+
+        // Number of values in this filter instance.
+        int fTrimmedLength;
+
+        // Filter length as specified. Note that this may be different from
+        // fTrimmedLength if leading/trailing zeros of the original floating
+        // point form were clipped differently on each tail.
+        int fLength;
+    };
+
+    // Stores the information for each filter added to this class.
+    SkTArray<FilterInstance> fFilters;
+
+    // We store all the filter values in this flat list, indexed by
+    // |FilterInstance.fDataLocation| to avoid the mallocs required for storing
+    // each one separately.
+    SkTArray<ConvolutionFixed> fFilterValues;
+
+    // The maximum size of any filter we've added.
+    int fMaxFilter;
+};
+
+typedef void (*SkConvolveVertically_pointer)(  // Vertical pass: produces one output row from |filterLength| source rows.
+    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+    int filterLength,
+    unsigned char* const* sourceDataRows,
+    int pixelWidth,
+    unsigned char* outRow,
+    bool hasAlpha);
+typedef void (*SkConvolve4RowsHorizontally_pointer)(  // Horizontal pass over four rows at once.
+    const unsigned char* srcData[4],
+    const SkConvolutionFilter1D& filter,
+    unsigned char* outRow[4]);
+typedef void (*SkConvolveHorizontally_pointer)(  // Horizontal pass over a single row.
+    const unsigned char* srcData,
+    const SkConvolutionFilter1D& filter,
+    unsigned char* outRow,
+    bool hasAlpha);
+typedef void (*SkConvolveFilterPadding_pointer)(  // Pads a filter's coefficient storage for SIMD loads.
+    SkConvolutionFilter1D* filter);
+
+struct SkConvolutionProcs {  // Table of platform-specific convolution routines.
+    // This is how many extra pixels may be read by the
+    // convolve*horizontally functions.
+    int fExtraHorizontalReads;
+    SkConvolveVertically_pointer fConvolveVertically;
+    SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
+    SkConvolveHorizontally_pointer fConvolveHorizontally;
+    SkConvolveFilterPadding_pointer fApplySIMDPadding;
+};
+
+
+
+// Does a two-dimensional convolution on the given source image.
+//
+// It is assumed the source pixel offsets referenced in the input filters
+// reference only valid pixels, so the source image size is not required. Each
+// row of the source image starts |sourceByteRowStride| after the previous
+// one (this allows you to have rows with some padding at the end).
+//
+// The result will be put into the given output buffer. The destination image
+// size will be xfilter.numValues() * yfilter.numValues() pixels. Each row holds
+// xfilter.numValues() * 4 bytes and starts |outputByteRowStride| after the previous one.
+//
+// |sourceHasAlpha| is a hint that allows us to avoid doing computations on
+// the alpha channel if the image is opaque. If you don't know, set this to
+// true and it will work properly, but setting this to false will be a few
+// percent faster if you know the image is opaque.
+//
+// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
+// (this is ARGB when loaded into 32-bit words on a little-endian machine).
+SK_API void BGRAConvolve2D(const unsigned char* sourceData,
+    int sourceByteRowStride,
+    bool sourceHasAlpha,
+    const SkConvolutionFilter1D& xfilter,
+    const SkConvolutionFilter1D& yfilter,
+    int outputByteRowStride,
+    unsigned char* output,
+    SkConvolutionProcs* convolveProcs,  // Accelerated routines; a null proc pointer falls back to the portable implementation.
+    bool useSimdIfPossible);  // NOTE(review): how this flag is used is not visible from this header -- confirm in SkConvolver.cpp.
+
+#endif  // SK_CONVOLVER_H
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index f992bcb636..95492c596d 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -11,6 +11,7 @@
 #include "SkColorPriv.h"
 #include "SkUnPreMultiply.h"
 #include "SkShader.h"
+#include "SkConvolver.h"
 
 #include "SkBitmapFilter_opts_SSE2.h"
 
@@ -180,3 +181,456 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
 
     }
 }
+
+// Convolves horizontally along a single row. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+void convolveHorizontally_SSE2(const unsigned char* src_data,
+                               const SkConvolutionFilter1D& filter,
+                               unsigned char* out_row,
+                               bool /*has_alpha*/) {
+  int num_values = filter.numValues();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to zero out the extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in the following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    __m128i accum = _mm_setzero_si128();
+
+    // Compute the first pixel in this row that the filter affects. It will
+    // touch |filter_length| pixels (4 bytes each) after this.
+    const __m128i* row_to_filter =
+        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+      __m128i coeff, coeff16;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Load four pixels => unpack the first two pixels to 16 bits =>
+      // multiply with coefficients => accumulate the convolution result.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Duplicate 3rd and 4th coefficients for all channels =>
+      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+      // => accumulate the convolution results.
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      // Advance the pixel and coefficients pointers.
+      row_to_filter += 1;
+      filter_values += 4;
+    }
+
+    // When |filter_length| is not divisible by 4, we need to zero out the
+    // filter coefficients that were loaded beyond the end of the filter. Other
+    // than that the algorithm is the same as above, except that the 4th pixel
+    // is always absent.
+    int r = filter_length&3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8).
+      __m128i coeff, coeff16;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+      // Note: line buffer must be padded to align_up(filter_offset, 16).
+      // We resolve this by using the C version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum = _mm_add_epi32(accum, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    accum = _mm_packs_epi32(accum, zero);
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    accum = _mm_packus_epi16(accum, zero);
+
+    // Store the 32-bit pixel value and advance to the next output pixel.
+    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+    out_row += 4;
+  }
+}
+
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost the same as |convolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
+                                    const SkConvolutionFilter1D& filter,
+                                    unsigned char* out_row[4]) {
+  int num_values = filter.numValues();
+
+  int filter_offset, filter_length;
+  __m128i zero = _mm_setzero_si128();
+  __m128i mask[4];
+  // |mask| will be used to zero out the extra filter coefficients that are
+  // loaded by SIMD when |filter_length| is not divisible by 4.
+  // mask[0] is not used in the following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+  // Output one pixel per row each iteration, calculating all channels (RGBA) together.
+  for (int out_x = 0; out_x < num_values; out_x++) {
+    const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+        filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+    // Four pixels in a column per iteration (one accumulator per source row).
+    __m128i accum0 = _mm_setzero_si128();
+    __m128i accum1 = _mm_setzero_si128();
+    __m128i accum2 = _mm_setzero_si128();
+    __m128i accum3 = _mm_setzero_si128();
+    int start = (filter_offset<<2);
+    // We will load and accumulate with four coefficients per iteration.
+    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+      __m128i coeff, coeff16lo, coeff16hi;
+      // [16] xx xx xx xx c3 c2 c1 c0
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // [16] xx xx xx xx c1 c1 c0 c0
+      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      // [16] c1 c1 c1 c1 c0 c0 c0 c0
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      // [16] xx xx xx xx c3 c3 c2 c2
+      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      // [16] c3 c3 c3 c3 c2 c2 c2 c2
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+#define ITERATION(src, accum)                                          \
+      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
+      src16 = _mm_unpacklo_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      src16 = _mm_unpackhi_epi8(src8, zero);                           \
+      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
+      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t);                                 \
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
+      accum = _mm_add_epi32(accum, t)
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);
+
+      start += 16;
+      filter_values += 4;
+    }
+
+    int r = filter_length & 3;
+    if (r) {
+      // Note: filter_values must be padded to align_up(filter_offset, 8);
+      __m128i coeff;
+      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+      // Mask out extra filter taps.
+      coeff = _mm_and_si128(coeff, mask[r]);
+
+      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+      /* c1 c1 c1 c1 c0 c0 c0 c0 */
+      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+      __m128i src8, src16, mul_hi, mul_lo, t;
+
+      ITERATION(src_data[0] + start, accum0);
+      ITERATION(src_data[1] + start, accum1);
+      ITERATION(src_data[2] + start, accum2);
+      ITERATION(src_data[3] + start, accum3);  // NOTE(review): ITERATION is never #undef'd after its last use.
+    }
+
+    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+    accum0 = _mm_packs_epi32(accum0, zero);
+    accum0 = _mm_packus_epi16(accum0, zero);
+    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_packs_epi32(accum1, zero);
+    accum1 = _mm_packus_epi16(accum1, zero);
+    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_packs_epi32(accum2, zero);
+    accum2 = _mm_packus_epi16(accum2, zero);
+    accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_packs_epi32(accum3, zero);
+    accum3 = _mm_packus_epi16(accum3, zero);
+
+    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+    out_row[0] += 4;
+    out_row[1] += 4;
+    out_row[2] += 4;
+    out_row[3] += 4;
+  }
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+//
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row) {
+  int width = pixel_width & ~3;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i accum0, accum1, accum2, accum3, coeff16;
+  const __m128i* src;
+  // Output four pixels per iteration (16 bytes).
+  for (int out_x = 0; out_x < width; out_x += 4) {
+
+    // Accumulated result for each pixel. 32 bits per RGBA channel.
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    accum3 = _mm_setzero_si128();
+
+    // Convolve with one filter coefficient per iteration.
+    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+      // Duplicate the filter coefficient 8 times.
+      // [16] cj cj cj cj cj cj cj cj
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+      // Load four pixels (16 bytes) together.
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][out_x << 2]);
+      __m128i src8 = _mm_loadu_si128(src);
+
+      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+
+      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
+      // multiply with current coefficient => accumulate the result.
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+      // [32] a3 b3 g3 r3
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum3 = _mm_add_epi32(accum3, t);
+    }
+
+    // Shift right for fixed point implementation.
+    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+    accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+
+    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] a3 b3 g3 r3 a2 b2 g2 r2
+    accum2 = _mm_packs_epi32(accum2, accum3);
+
+    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+
+    if (has_alpha) {
+      // Compute the max(ri, gi, bi) for each pixel.
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+
+      // Make sure the value of the alpha channel is never smaller than the
+      // maximum value of the color channels.
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      // Set value of alpha channels to 0xFF.
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    // Store the convolution result (16 bytes) and advance the pixel pointers.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+    out_row += 16;
+  }
+
+  // When the width of the output is not divisible by 4, we compute the last
+  // 1-3 pixels together and store them one pixel (4 bytes) at a time; the fourth pixel is always absent.
+  if (pixel_width & 3) {
+    accum0 = _mm_setzero_si128();
+    accum1 = _mm_setzero_si128();
+    accum2 = _mm_setzero_si128();
+    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+      src = reinterpret_cast<const __m128i*>(
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);  // NOTE(review): full 16-byte load although fewer than 4 pixels remain; presumably rows are padded -- confirm.
+      // [16] a1 b1 g1 r1 a0 b0 g0 r0
+      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a0 b0 g0 r0
+      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum0 = _mm_add_epi32(accum0, t);
+      // [32] a1 b1 g1 r1
+      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+      accum1 = _mm_add_epi32(accum1, t);
+      // [16] a3 b3 g3 r3 a2 b2 g2 r2
+      src16 = _mm_unpackhi_epi8(src8, zero);
+      mul_hi = _mm_mulhi_epi16(src16, coeff16);
+      mul_lo = _mm_mullo_epi16(src16, coeff16);
+      // [32] a2 b2 g2 r2
+      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+      accum2 = _mm_add_epi32(accum2, t);
+    }
+
+    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+    // [16] a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packs_epi32(accum0, accum1);
+    // [16] 00 00 00 00 a2 b2 g2 r2 (the fourth pixel slot is zero-filled)
+    accum2 = _mm_packs_epi32(accum2, zero);
+    // [8] 00 00 00 00 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+    accum0 = _mm_packus_epi16(accum0, accum2);
+    if (has_alpha) {
+      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+      __m128i a = _mm_srli_epi32(accum0, 8);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
+      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+      a = _mm_srli_epi32(accum0, 16);
+      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+      b = _mm_max_epu8(a, b);  // Max of r and g and b.
+      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+      b = _mm_slli_epi32(b, 24);
+      accum0 = _mm_max_epu8(b, accum0);
+    } else {
+      __m128i mask = _mm_set1_epi32(0xff000000);
+      accum0 = _mm_or_si128(accum0, mask);
+    }
+
+    for (int out_x = width; out_x < pixel_width; out_x++) {
+      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+      accum0 = _mm_srli_si128(accum0, 4);
+      out_row += 4;
+    }
+  }
+}
+
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row,
+                             bool has_alpha) {
+  if (has_alpha) {  // Turn the runtime flag into the matching template instantiation.
+    convolveVertically_SSE2<true>(filter_values,
+                                  filter_length,
+                                  source_data_rows,
+                                  pixel_width,
+                                  out_row);
+  } else {  // Opaque source: the <false> instantiation sets every alpha byte to 0xFF.
+    convolveVertically_SSE2<false>(filter_values,
+                                   filter_length,
+                                   source_data_rows,
+                                   pixel_width,
+                                   out_row);
+  }
+}
+
+void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
+    // Append 8 dummy (zero) coefficients after the coefficients of the last
+    // filter to prevent SIMD instructions which load 8 or 16 bytes together
+    // from accessing invalid memory areas. We are not trying to align the
+    // coefficients right now due to the opaqueness of the container implementation.
+    // This has to be done after all |AddFilter| calls.
+    for (int i = 0; i < 8; ++i) {
+        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
+    }
+}
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.h b/src/opts/SkBitmapFilter_opts_SSE2.h
index c511acc83a..588f4ef18b 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.h
+++ b/src/opts/SkBitmapFilter_opts_SSE2.h
@@ -11,10 +11,27 @@
 #define SkBitmapFilter_opts_sse2_DEFINED
 
 #include "SkBitmapProcState.h"
+#include "SkConvolver.h"
 
 void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
                           SkPMColor *SK_RESTRICT colors, int count);
 void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
                 SkPMColor *SK_RESTRICT colors, int count);
 
+
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row,
+                             bool has_alpha);  // SSE2 vertical pass: writes one output row.
+void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
+                                    const SkConvolutionFilter1D& filter,
+                                    unsigned char* out_row[4]);  // SSE2 horizontal pass over four rows at once.
+void convolveHorizontally_SSE2(const unsigned char* src_data,
+                               const SkConvolutionFilter1D& filter,
+                               unsigned char* out_row,
+                               bool has_alpha);  // SSE2 horizontal pass over a single row.
+void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter);  // Appends zero coefficients so SIMD loads stay in bounds.
+
 #endif
diff --git a/src/opts/SkBitmapProcState_opts_none.cpp b/src/opts/SkBitmapProcState_opts_none.cpp
index 3a186b5bfe..62af6d0f83 100644
--- a/src/opts/SkBitmapProcState_opts_none.cpp
+++ b/src/opts/SkBitmapProcState_opts_none.cpp
@@ -21,3 +21,6 @@
 
 // empty implementation just uses default supplied function pointers
 void SkBitmapProcState::platformProcs() {}
+
+// Empty implementation: keeps the default supplied function pointers. NOTE(review): the SSE2 build defines platformConvolutionProcs() instead -- confirm the intended name.
+void SkBitmapProcState::platformScaleProc() {}
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 37ce9036ca..0bb450356d 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -107,6 +107,16 @@ static bool cachedHasSSSE3() {
 
 SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
 
+void SkBitmapProcState::platformConvolutionProcs() {  // Installs the SSE2 convolution routines when cachedHasSSE2() reports support.
+    if (cachedHasSSE2()) {
+        fConvolutionProcs->fExtraHorizontalReads = 3;  // 16-byte SSE2 loads cover 4 pixels, so up to 3 may be read past the last one needed.
+        fConvolutionProcs->fConvolveVertically = &convolveVertically_SSE2;
+        fConvolutionProcs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
+        fConvolutionProcs->fConvolveHorizontally = &convolveHorizontally_SSE2;
+        fConvolutionProcs->fApplySIMDPadding = &applySIMDPadding_SSE2;
+    }
+}
+
 void SkBitmapProcState::platformProcs() {
     if (cachedHasSSSE3()) {
 #if !defined(SK_BUILD_FOR_ANDROID)
@@ -151,9 +161,6 @@ void SkBitmapProcState::platformProcs() {
             if (fShaderProc32 == highQualityFilter) {
                 fShaderProc32 = highQualityFilter_SSE2;
             }
-            if (fShaderProc32 == highQualityFilter_ScaleOnly) {
-                fShaderProc32 = highQualityFilter_ScaleOnly_SSE2;
-            }
         }
     }
 }