aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar humper@google.com <humper@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-07-19 20:20:04 +0000
committerGravatar humper@google.com <humper@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-07-19 20:20:04 +0000
commit138ebc3e4061cf533ea2f7f3717239670fdc6e43 (patch)
treea9bbf3d68a36e5938b5a41df954ad0471b2e20e2
parentd322cf4939872bbff063468d7357c76eb6250d0f (diff)
The image resampling code has been transplanted from Chrome; it's incredibly fast.
We've tested this CL plumbed into Chrome and done benchmarking with excellent results. This CL can land independent of any Chrome changes; it's completely internal to skia. BUG= R=reed@google.com Review URL: https://codereview.chromium.org/19335002 git-svn-id: http://skia.googlecode.com/svn/trunk@10206 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r--gm/downsamplebitmap.cpp2
-rw-r--r--gyp/core.gypi4
-rw-r--r--include/core/SkBitmap.h14
-rw-r--r--src/core/SkBitmapFilter.cpp227
-rw-r--r--src/core/SkBitmapFilter.h56
-rw-r--r--src/core/SkBitmapProcState.cpp57
-rw-r--r--src/core/SkBitmapProcState.h16
-rw-r--r--src/core/SkBitmapScaler.cpp315
-rw-r--r--src/core/SkBitmapScaler.h106
-rw-r--r--src/core/SkConvolver.cpp473
-rw-r--r--src/core/SkConvolver.h203
-rw-r--r--src/opts/SkBitmapFilter_opts_SSE2.cpp454
-rw-r--r--src/opts/SkBitmapFilter_opts_SSE2.h17
-rw-r--r--src/opts/SkBitmapProcState_opts_none.cpp3
-rw-r--r--src/opts/opts_check_SSE2.cpp13
15 files changed, 1692 insertions, 268 deletions
diff --git a/gm/downsamplebitmap.cpp b/gm/downsamplebitmap.cpp
index a59e5b85df..e34effa07f 100644
--- a/gm/downsamplebitmap.cpp
+++ b/gm/downsamplebitmap.cpp
@@ -75,7 +75,7 @@ protected:
curWidth = (int) (fBM.width() * curScale + 2);
curX += curWidth;
curScale *= 0.75f;
- } while (curX < 4*fBM.width());
+ } while (curWidth >= 2 && curX < 4*fBM.width());
}
private:
diff --git a/gyp/core.gypi b/gyp/core.gypi
index bf5e245924..eac96f6f9f 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -32,6 +32,8 @@
'<(skia_src_path)/core/SkBitmapProcState_matrix.h',
'<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp',
'<(skia_src_path)/core/SkBitmapProcState_sample.h',
+ '<(skia_src_path)/core/SkBitmapScaler.h',
+ '<(skia_src_path)/core/SkBitmapScaler.cpp',
'<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h',
'<(skia_src_path)/core/SkBitmapShaderTemplate.h',
'<(skia_src_path)/core/SkBitmap_scroll.cpp',
@@ -56,6 +58,8 @@
'<(skia_src_path)/core/SkComposeShader.cpp',
'<(skia_src_path)/core/SkConfig8888.cpp',
'<(skia_src_path)/core/SkConfig8888.h',
+ '<(skia_src_path)/core/SkConvolver.cpp',
+ '<(skia_src_path)/core/SkConvolver.h',
'<(skia_src_path)/core/SkCordic.cpp',
'<(skia_src_path)/core/SkCordic.h',
'<(skia_src_path)/core/SkCoreBlitters.h',
diff --git a/include/core/SkBitmap.h b/include/core/SkBitmap.h
index d5277c6c80..6d368f5b49 100644
--- a/include/core/SkBitmap.h
+++ b/include/core/SkBitmap.h
@@ -702,19 +702,7 @@ private:
int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy);
bool hasMipMap() const;
void freeMipMap();
-
- /** Make a scaled copy of this bitmap into the provided destination.
- * The caller is responsible for having set the width and height of the
- * provided destination bitmap, and also having allocated its pixel
- * memory.
- *
- * This function is temporary and for testing purposes only; it will
- * likely move once it has been properly plumbed into the bitmap
- * shader infrastructure.
- */
-
- void scale(SkBitmap *dst) const;
-
+
friend struct SkBitmapProcState;
};
diff --git a/src/core/SkBitmapFilter.cpp b/src/core/SkBitmapFilter.cpp
index 434ea9a536..060400944f 100644
--- a/src/core/SkBitmapFilter.cpp
+++ b/src/core/SkBitmapFilter.cpp
@@ -5,15 +5,23 @@
* found in the LICENSE file.
*/
+#include "SkErrorInternals.h"
+#include "SkConvolver.h"
#include "SkBitmapProcState.h"
#include "SkBitmap.h"
#include "SkColor.h"
#include "SkColorPriv.h"
+#include "SkConvolver.h"
#include "SkUnPreMultiply.h"
#include "SkShader.h"
#include "SkRTConf.h"
#include "SkMath.h"
+// These are the per-scanline callbacks that are used when we must resort to
+// resampling an image as it is blitted. Typically these are used only when
+// the image is rotated or has some other complex transformation applied.
+// Scaled images will usually be rescaled directly before rasterization.
+
void highQualityFilter(const SkBitmapProcState& s, int x, int y,
SkPMColor* SK_RESTRICT colors, int count) {
@@ -68,71 +76,15 @@ void highQualityFilter(const SkBitmapProcState& s, int x, int y,
}
}
-void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
- SkPMColor *SK_RESTRICT colors, int count) {
- const int maxX = s.fBitmap->width() - 1;
- const int maxY = s.fBitmap->height() - 1;
-
- SkPoint srcPt;
-
- s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
- SkFloatToScalar(y + 0.5f), &srcPt);
- srcPt.fY -= SK_ScalarHalf;
- int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
- int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()), maxY);
-
- while (count-- > 0) {
- s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f),
- SkFloatToScalar(y + 0.5f), &srcPt);
- srcPt.fX -= SK_ScalarHalf;
- srcPt.fY -= SK_ScalarHalf;
-
- SkScalar weight = 0;
- SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
-
- int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
- int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width()), maxX);
-
- for (int srcY = y0; srcY <= y1; srcY++) {
- SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
-
- for (int srcX = x0; srcX <= x1 ; srcX++) {
- SkScalar xWeight = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
-
- SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
-
- SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
- fr += combined_weight * SkGetPackedR32(c);
- fg += combined_weight * SkGetPackedG32(c);
- fb += combined_weight * SkGetPackedB32(c);
- fa += combined_weight * SkGetPackedA32(c);
- weight += combined_weight;
- }
- }
-
- fr = SkScalarDiv(fr, weight);
- fg = SkScalarDiv(fg, weight);
- fb = SkScalarDiv(fb, weight);
- fa = SkScalarDiv(fa, weight);
-
- int a = SkClampMax(SkScalarRoundToInt(fa), 255);
- int r = SkClampMax(SkScalarRoundToInt(fr), a);
- int g = SkClampMax(SkScalarRoundToInt(fg), a);
- int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
- *colors++ = SkPackARGB32(a, r, g, b);
-
- x++;
- }
-}
-
-SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which bitmap filter to use [mitchell, sinc, gaussian, triangle, box]");
+SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which scanline bitmap filter to use [mitchell, lanczos, hamming, gaussian, triangle, box]");
-static SkBitmapFilter *allocateBitmapFilter() {
+SkBitmapFilter *SkBitmapFilter::Allocate() {
if (!strcmp(c_bitmapFilter, "mitchell")) {
return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f));
- } else if (!strcmp(c_bitmapFilter, "sinc")) {
- return SkNEW_ARGS(SkSincFilter,(3));
+ } else if (!strcmp(c_bitmapFilter, "lanczos")) {
+ return SkNEW(SkLanczosFilter);
+ } else if (!strcmp(c_bitmapFilter, "hamming")) {
+ return SkNEW(SkHammingFilter);
} else if (!strcmp(c_bitmapFilter, "gaussian")) {
return SkNEW_ARGS(SkGaussianFilter,(2));
} else if (!strcmp(c_bitmapFilter, "triangle")) {
@@ -168,159 +120,12 @@ SkBitmapProcState::chooseBitmapFilterProc() {
}
if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) {
- fBitmapFilter = allocateBitmapFilter();
+ fBitmapFilter = SkBitmapFilter::Allocate();
}
- if (fInvType & SkMatrix::kAffine_Mask) {
+ if (fInvType & SkMatrix::kScale_Mask) {
return highQualityFilter;
- } else if (fInvType & SkMatrix::kScale_Mask) {
- return highQualityFilter_ScaleOnly;
} else {
return NULL;
}
}
-
-static void divideByWeights(SkScalar *sums, SkScalar *weights, SkBitmap *dst) {
- for (int y = 0 ; y < dst->height() ; y++) {
- for (int x = 0 ; x < dst->width() ; x++) {
- SkScalar fr = SkScalarDiv(sums[4*(y*dst->width() + x) + 0], weights[y*dst->width() + x]);
- SkScalar fg = SkScalarDiv(sums[4*(y*dst->width() + x) + 1], weights[y*dst->width() + x]);
- SkScalar fb = SkScalarDiv(sums[4*(y*dst->width() + x) + 2], weights[y*dst->width() + x]);
- SkScalar fa = SkScalarDiv(sums[4*(y*dst->width() + x) + 3], weights[y*dst->width() + x]);
- int a = SkClampMax(SkScalarRoundToInt(fa), 255);
- int r = SkClampMax(SkScalarRoundToInt(fr), a);
- int g = SkClampMax(SkScalarRoundToInt(fg), a);
- int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
- *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
- }
- }
-}
-
-static void upScaleHorizTranspose(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
- for (int y = 0 ; y < dst->height() ; y++) {
- for (int x = 0 ; x < dst->width() ; x++) {
- float sx = (y + 0.5f) / scale - 0.5f;
- int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1);
- int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1);
-
- SkScalar totalWeight = 0;
- SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
-
- for (int srcX = x0 ; srcX <= x1 ; srcX++) {
- SkScalar weight = filter->lookupScalar(sx - srcX);
- SkPMColor c = *src->getAddr32(srcX, x);
- fr += SkScalarMul(weight,SkGetPackedR32(c));
- fg += SkScalarMul(weight,SkGetPackedG32(c));
- fb += SkScalarMul(weight,SkGetPackedB32(c));
- fa += SkScalarMul(weight,SkGetPackedA32(c));
- totalWeight += weight;
- }
- fr = SkScalarDiv(fr,totalWeight);
- fg = SkScalarDiv(fg,totalWeight);
- fb = SkScalarDiv(fb,totalWeight);
- fa = SkScalarDiv(fa,totalWeight);
-
- int a = SkClampMax(SkScalarRoundToInt(fa), 255);
- int r = SkClampMax(SkScalarRoundToInt(fr), a);
- int g = SkClampMax(SkScalarRoundToInt(fg), a);
- int b = SkClampMax(SkScalarRoundToInt(fb), a);
-
- *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b);
- }
- }
-}
-
-static void downScaleHoriz(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
- SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
- SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
-
- SkAutoTDeleteArray<SkScalar> ada1(sums);
- SkAutoTDeleteArray<SkScalar> ada2(weights);
-
- memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
- memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
-
- for (int y = 0 ; y < src->height() ; y++) {
- for (int x = 0 ; x < src->width() ; x++) {
- // splat each source pixel into the destination image
- float dx = (x + 0.5f) * scale - 0.5f;
- int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->width()-1);
- int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->width()-1);
-
- SkPMColor c = *src->getAddr32(x,y);
-
- for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) {
- SkScalar weight = filter->lookup(dx - dst_x);
- sums[4*(y*dst->width() + dst_x) + 0] += weight*SkGetPackedR32(c);
- sums[4*(y*dst->width() + dst_x) + 1] += weight*SkGetPackedG32(c);
- sums[4*(y*dst->width() + dst_x) + 2] += weight*SkGetPackedB32(c);
- sums[4*(y*dst->width() + dst_x) + 3] += weight*SkGetPackedA32(c);
- weights[y*dst->width() + dst_x] += weight;
- }
- }
- }
-
- divideByWeights(sums, weights, dst);
-}
-
-static void downScaleVert(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) {
- SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4);
- SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height());
-
- SkAutoTDeleteArray<SkScalar> ada1(sums);
- SkAutoTDeleteArray<SkScalar> ada2(weights);
-
- memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4);
- memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar));
-
- for (int y = 0 ; y < src->height() ; y++) {
- for (int x = 0 ; x < src->width() ; x++) {
- // splat each source pixel into the destination image
- float dy = (y + 0.5f) * scale - 0.5f;
- int y0 = SkClampMax(sk_float_ceil2int(dy-filter->width()), dst->height()-1);
- int y1 = SkClampMax(sk_float_ceil2int(dy+filter->width()), dst->height()-1);
-
- SkPMColor c = *src->getAddr32(x,y);
-
- for (int dst_y = y0 ; dst_y <= y1 ; dst_y++) {
- SkScalar weight = filter->lookupScalar(dy - dst_y);
- sums[4*(dst_y*dst->width() + x) + 0] += weight*SkGetPackedR32(c);
- sums[4*(dst_y*dst->width() + x) + 1] += weight*SkGetPackedG32(c);
- sums[4*(dst_y*dst->width() + x) + 2] += weight*SkGetPackedB32(c);
- sums[4*(dst_y*dst->width() + x) + 3] += weight*SkGetPackedA32(c);
- weights[dst_y*dst->width() + x] += weight;
- }
- }
- }
-
- divideByWeights(sums, weights, dst);
-}
-
-void SkBitmap::scale(SkBitmap *dst) const {
-
- SkBitmap horizTemp;
-
- horizTemp.setConfig(SkBitmap::kARGB_8888_Config, height(), dst->width());
- horizTemp.allocPixels();
-
- SkBitmapFilter *filter = allocateBitmapFilter();
-
- float horizScale = float(dst->width()) / width();
-
- if (horizScale >= 1) {
- upScaleHorizTranspose(this, &horizTemp, horizScale, filter);
- } else if (horizScale < 1) {
- downScaleHoriz(this, &horizTemp, horizScale, filter);
- }
-
- float vertScale = float(dst->height()) / height();
-
- if (vertScale >= 1) {
- upScaleHorizTranspose(&horizTemp, dst, vertScale, filter);
- } else if (vertScale < 1) {
- downScaleVert(&horizTemp, dst, vertScale, filter);
- }
-
- SkDELETE(filter);
-}
diff --git a/src/core/SkBitmapFilter.h b/src/core/SkBitmapFilter.h
index 38c2448c69..6a9e3d7c01 100644
--- a/src/core/SkBitmapFilter.h
+++ b/src/core/SkBitmapFilter.h
@@ -26,28 +26,30 @@ class SkBitmapFilter {
fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1);
}
- SkFixed lookup( float x ) const {
+ SkFixed lookup(float x) const {
if (!fPrecomputed) {
precomputeTable();
}
int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
- return fFilterTable[ filter_idx ];
+ return fFilterTable[filter_idx];
}
- SkScalar lookupScalar( float x ) const {
+ SkScalar lookupScalar(float x) const {
if (!fPrecomputed) {
precomputeTable();
}
int filter_idx = int(sk_float_abs(x * fLookupMultiplier));
SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE);
- return fFilterTableScalar[ filter_idx ];
+ return fFilterTableScalar[filter_idx];
}
float width() const { return fWidth; }
float invWidth() const { return fInvWidth; }
virtual float evaluate(float x) const = 0;
virtual ~SkBitmapFilter() {}
+
+ static SkBitmapFilter* Allocate();
protected:
float fWidth;
float fInvWidth;
@@ -126,29 +128,47 @@ class SkBoxFilter: public SkBitmapFilter {
}
virtual float evaluate(float x) const SK_OVERRIDE {
- return 1;
+ return (x >= -fWidth && x < fWidth) ? 1.0f : 0.0f;
}
protected:
};
+// Hamming-windowed sinc filter: evaluate() returns sinc(x) multiplied by a
+// Hamming window term, and 0 once |x| reaches fWidth.
+// NOTE(review): FLT_EPSILON and M_PI are used below; no include providing
+// them is visible in this hunk -- confirm they are in scope on all platforms.
+class SkHammingFilter: public SkBitmapFilter {
+public:
+ SkHammingFilter(float width=1.f)
+ : SkBitmapFilter(width) {
+ }
+ virtual float evaluate(float x) const SK_OVERRIDE {
+ if (x <= -fWidth || x >= fWidth) {
+ return 0.0f; // Outside of the window.
+ }
+ if (x > -FLT_EPSILON && x < FLT_EPSILON) {
+ return 1.0f; // sin(xpi) / xpi would be 0/0 at the origin; return its limit instead.
+ }
+ const float xpi = x * static_cast<float>(M_PI);
+
+ return ((sk_float_sin(xpi) / xpi) * // sinc(x)
+ (0.54f + 0.46f * sk_float_cos(xpi / fWidth))); // hamming(x)
+ }
+};
-class SkSincFilter: public SkBitmapFilter {
+class SkLanczosFilter: public SkBitmapFilter {
public:
- SkSincFilter(float t, float width=3.f)
- : SkBitmapFilter(width), tau(t) {
+ SkLanczosFilter(float width=3.f)
+ : SkBitmapFilter(width) {
}
virtual float evaluate(float x) const SK_OVERRIDE {
- x = sk_float_abs(x * fInvWidth);
- if (x < 1e-5f) return 1.f;
- if (x > 1.f) return 0.f;
- x *= SK_ScalarPI;
- float sinc = sk_float_sin(x) / x;
- float lanczos = sk_float_sin(x * tau) / (x * tau);
- return sinc * lanczos;
- }
- protected:
- float tau;
+ if (x <= -fWidth || x >= fWidth) {
+ return 0.0f; // Outside of the window.
+ }
+ if (x > -FLT_EPSILON && x < FLT_EPSILON) {
+ return 1.0f; // Special case the discontinuity at the origin.
+ }
+ float xpi = x * static_cast<float>(M_PI);
+ return (sk_float_sin(xpi) / xpi) * // sinc(x)
+ sk_float_sin(xpi / fWidth) / (xpi / fWidth); // sinc(x/fWidth)
+ }
};
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index a8a9b03d9a..57af144034 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -11,6 +11,7 @@
#include "SkPaint.h"
#include "SkShader.h" // for tilemodes
#include "SkUtilsArm.h"
+#include "SkBitmapScaler.h"
#if !SK_ARM_NEON_IS_NONE
// These are defined in src/opts/SkBitmapProcState_arm_neon.cpp
@@ -99,23 +100,45 @@ void SkBitmapProcState::possiblyScaleImage() {
if (fFilterQuality != kHQ_BitmapFilter) {
return;
}
-
- // STEP 1: UPSAMPLE?
-
- // Check to see if the transformation matrix is scaling up, and if
- // the matrix is simple, and if we're doing high quality scaling.
- // If so, do the bitmap scale here and remove the scaling component from the matrix.
-
- if (fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
- (fInvMatrix.getScaleX() < 1 || fInvMatrix.getScaleY() < 1) &&
+
+ // see if our platform has any specialized convolution code.
+
+
+ // Set up a pointer to a local (instead of storing the structure in the
+ // proc state) to avoid introducing a header dependency; this makes
+ // recompiles a lot less painful.
+
+ SkConvolutionProcs simd;
+ fConvolutionProcs = &simd;
+
+ fConvolutionProcs->fExtraHorizontalReads = 0;
+ fConvolutionProcs->fConvolveVertically = NULL;
+ fConvolutionProcs->fConvolve4RowsHorizontally = NULL;
+ fConvolutionProcs->fConvolveHorizontally = NULL;
+ fConvolutionProcs->fApplySIMDPadding = NULL;
+
+ this->platformConvolutionProcs();
+
+ // STEP 1: Highest quality direct scale?
+
+ // Check to see if the transformation matrix is simple, and if we're
+ // doing high quality scaling. If so, do the bitmap scale here and
+ // remove the scaling component from the matrix.
+
+ if (fFilterQuality == kHQ_BitmapFilter &&
+ fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) &&
fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) {
-
+
+ int dest_width = SkScalarCeilToInt(fOrigBitmap.width() / fInvMatrix.getScaleX());
+ int dest_height = SkScalarCeilToInt(fOrigBitmap.height() / fInvMatrix.getScaleY());
+
// All the criteria are met; let's make a new bitmap.
- fScaledBitmap.setConfig(SkBitmap::kARGB_8888_Config,
- (int)(fOrigBitmap.width() / fInvMatrix.getScaleX()),
- (int)(fOrigBitmap.height() / fInvMatrix.getScaleY()));
- fScaledBitmap.allocPixels();
- fOrigBitmap.scale(&fScaledBitmap);
+
+ fScaledBitmap = SkBitmapScaler::Resize( fOrigBitmap, SkBitmapScaler::RESIZE_BEST,
+ dest_width, dest_height, fConvolutionProcs );
+
+ fScaledBitmap.lockPixels();
+
fBitmap = &fScaledBitmap;
// set the inv matrix type to translate-only;
@@ -130,9 +153,9 @@ void SkBitmapProcState::possiblyScaleImage() {
return;
}
- if (!fOrigBitmap.hasMipMap()) {
+ if (!fOrigBitmap.hasMipMap() && fFilterQuality != kNone_BitmapFilter) {
- // STEP 2: DOWNSAMPLE
+ // STEP 2: MIPMAP DOWNSAMPLE?
// Check to see if the transformation matrix is scaling *down*.
// If so, automatically build mipmaps.
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index a644dd1e02..3c8e346807 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -31,6 +31,7 @@
#endif
class SkPaint;
+class SkConvolutionProcs;
struct SkBitmapProcState {
@@ -59,7 +60,7 @@ struct SkBitmapProcState {
const uint32_t[],
int count,
uint16_t colors[]);
-
+
typedef U16CPU (*FixedTileProc)(SkFixed); // returns 0..0xFFFF
typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int); // returns 0..0xF
typedef U16CPU (*IntTileProc)(int value, int count); // returns 0..count-1
@@ -78,6 +79,8 @@ struct SkBitmapProcState {
IntTileProc fIntTileProcY; // chooseProcs
SkFixed fFilterOneX;
SkFixed fFilterOneY;
+
+ SkConvolutionProcs* fConvolutionProcs; // possiblyScaleImage
SkPMColor fPaintPMColor; // chooseProcs - A8 config
SkFixed fInvSx; // chooseProcs
@@ -113,7 +116,12 @@ struct SkBitmapProcState {
implementation can do nothing (see SkBitmapProcState_opts_none.cpp)
*/
void platformProcs();
-
+
+ /** Platforms can also optionally overwrite the convolution functions
+ if we have SIMD versions of them.
+ */
+
+ void platformConvolutionProcs();
/** Given the byte size of the index buffer to be passed to the matrix proc,
return the maximum number of resulting pixels that can be computed
@@ -160,7 +168,7 @@ private:
void possiblyScaleImage();
- SkBitmapFilter *fBitmapFilter;
+ SkBitmapFilter* fBitmapFilter;
ShaderProc32 chooseBitmapFilterProc();
@@ -218,8 +226,6 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
void S32_D16_filter_DX(const SkBitmapProcState& s,
const uint32_t* xy, int count, uint16_t* colors);
-void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y,
- SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
diff --git a/src/core/SkBitmapScaler.cpp b/src/core/SkBitmapScaler.cpp
new file mode 100644
index 0000000000..7e840d2fdb
--- /dev/null
+++ b/src/core/SkBitmapScaler.cpp
@@ -0,0 +1,315 @@
+#include "SkBitmapScaler.h"
+#include "SkBitmapFilter.h"
+#include "SkRect.h"
+#include "SkTArray.h"
+#include "SkErrorInternals.h"
+#include "SkConvolver.h"
+
+// SkResizeFilter ----------------------------------------------------------------
+
+// Encapsulates computation and storage of the filters required for one complete
+// resize operation.
+class SkResizeFilter {
+public:
+ SkResizeFilter(SkBitmapScaler::ResizeMethod method,
+ int srcFullWidth, int srcFullHeight,
+ int destWidth, int destHeight,
+ const SkIRect& destSubset,
+ SkConvolutionProcs* convolveProcs);
+ ~SkResizeFilter() {
+ SkDELETE( fBitmapFilter );
+ }
+
+ // Returns the filled filter values.
+ const SkConvolutionFilter1D& xFilter() { return fXFilter; }
+ const SkConvolutionFilter1D& yFilter() { return fYFilter; }
+
+private:
+
+ // Filter kernel; allocated in the constructor and deleted in the destructor.
+ // NOTE(review): no copy constructor/assignment is declared here, so copying
+ // an SkResizeFilter would delete this pointer twice -- confirm it is never
+ // copied.
+ SkBitmapFilter* fBitmapFilter;
+
+ // Computes one set of filters either horizontally or vertically. The caller
+ // passes a generic "lo" and "size" rather than left/width or top/height
+ // so that the same code can be re-used in each dimension.
+ //
+ // |srcSize| is the full extent of the source along the chosen dimension.
+ //
+ // |destSubsetLo| and |destSubsetSize| give the range of destination pixels
+ // to compute filters for, and |scale| is the destination/source size ratio.
+ //
+ // One filter per destination pixel is added to |output|. If
+ // |convolveProcs| provides fApplySIMDPadding, it is run on |output| last.
+
+ void computeFilters(int srcSize,
+ int destSubsetLo, int destSubsetSize,
+ float scale,
+ SkConvolutionFilter1D* output,
+ SkConvolutionProcs* convolveProcs);
+
+ // Subset of scaled destination bitmap to compute.
+ SkIRect fOutBounds;
+
+ SkConvolutionFilter1D fXFilter;
+ SkConvolutionFilter1D fYFilter;
+};
+
+// Selects the filter kernel matching |method| and precomputes the horizontal
+// and vertical convolution filters needed to produce |destSubset| of a
+// |destWidth| x |destHeight| resize of a |srcFullWidth| x |srcFullHeight| source.
+//
+// The scale factors divide by |srcFullWidth| and |srcFullHeight|; the Resize()
+// caller in this file rejects sources smaller than 1 before constructing one.
+SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method,
+ int srcFullWidth, int srcFullHeight,
+ int destWidth, int destHeight,
+ const SkIRect& destSubset,
+ SkConvolutionProcs* convolveProcs)
+ : fOutBounds(destSubset) {
+
+ // method will only ever refer to an "algorithm method".
+ SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+ (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
+
+ switch(method) {
+ case SkBitmapScaler::RESIZE_BOX:
+ fBitmapFilter = SkNEW(SkBoxFilter);
+ break;
+ case SkBitmapScaler::RESIZE_TRIANGLE:
+ fBitmapFilter = SkNEW(SkTriangleFilter);
+ break;
+ case SkBitmapScaler::RESIZE_MITCHELL:
+ fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
+ break;
+ case SkBitmapScaler::RESIZE_HAMMING:
+ fBitmapFilter = SkNEW(SkHammingFilter);
+ break;
+ case SkBitmapScaler::RESIZE_LANCZOS3:
+ fBitmapFilter = SkNEW(SkLanczosFilter);
+ break;
+ default:
+ // NOTREACHED: the assert above rules this out; fall back to Mitchell so
+ // that fBitmapFilter is never left unset.
+ fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f));
+ break;
+ }
+
+
+ // Scale factors are destination size over source size, so a value below 1
+ // means that dimension is being shrunk.
+ float scaleX = static_cast<float>(destWidth) /
+ static_cast<float>(srcFullWidth);
+ float scaleY = static_cast<float>(destHeight) /
+ static_cast<float>(srcFullHeight);
+
+ this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(),
+ scaleX, &fXFilter, convolveProcs);
+ this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(),
+ scaleY, &fYFilter, convolveProcs);
+}
+
+// TODO(egouriou): Take advantage of periods in the convolution.
+// Practical resizing filters are periodic outside of the border area.
+// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the
+// source become p pixels in the destination) will have a period of p.
+// A nice consequence is a period of 1 when downscaling by an integral
+// factor. Downscaling from typical display resolutions is also bound
+// to produce interesting periods as those are chosen to have multiple
+// small factors.
+// Small periods reduce computational load and improve cache usage if
+// the coefficients can be shared. For periods of 1 we can consider
+// loading the factors only once outside the borders.
+// Builds the 1D convolution filters for one dimension. For each destination
+// pixel in [destSubsetLo, destSubsetLo + destSubsetSize) it evaluates
+// fBitmapFilter over the source pixels it covers (clamped to
+// [0, srcSize - 1]), normalizes the weights, converts them to fixed point and
+// adds them to |output|. |scale| is the destination/source size ratio.
+void SkResizeFilter::computeFilters(int srcSize,
+ int destSubsetLo, int destSubsetSize,
+ float scale,
+ SkConvolutionFilter1D* output,
+ SkConvolutionProcs* convolveProcs) {
+ int destSubsetHi = destSubsetLo + destSubsetSize; // [lo, hi)
+
+ // When we're doing a magnification, the scale will be larger than one. This
+ // means the destination pixels are much smaller than the source pixels, and
+ // that the range covered by the filter won't necessarily cover any source
+ // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+ // some computations.
+ float clampedScale = SkTMin(1.0f, scale);
+
+ // This is how many source pixels from the center we need to count
+ // to support the filtering function.
+ float srcSupport = fBitmapFilter->width() / clampedScale;
+
+ // Speed up the divisions below by turning them into multiplies.
+ float invScale = 1.0f / scale;
+
+ SkTArray<float> filterValues(64);
+ SkTArray<short> fixedFilterValues(64);
+
+ // Loop over all pixels in the output range. We will generate one set of
+ // filter values for each one. Those values will tell us how to blend the
+ // source pixels to compute the destination pixel.
+ for (int destSubsetI = destSubsetLo; destSubsetI < destSubsetHi;
+ destSubsetI++) {
+ // Reset the arrays. We don't declare them inside so they can re-use the
+ // same malloc-ed buffer.
+ filterValues.reset();
+ fixedFilterValues.reset();
+
+ // This is the pixel in the source directly under the pixel in the dest.
+ // Note that we base computations on the "center" of the pixels. To see
+ // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
+ // downscale should "cover" the pixels around the pixel with *its center*
+ // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
+ // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
+ float srcPixel = (static_cast<float>(destSubsetI) + 0.5f) * invScale;
+
+ // Compute the (inclusive) range of source pixels the filter covers.
+ int srcBegin = SkTMax(0, SkScalarFloorToInt(srcPixel - srcSupport));
+ int srcEnd = SkTMin(srcSize - 1, SkScalarCeilToInt(srcPixel + srcSupport));
+
+ // Compute the unnormalized filter value at each location of the source
+ // it covers.
+ float filterSum = 0.0f; // Sum of the filter values for normalizing.
+ for (int curFilterPixel = srcBegin; curFilterPixel <= srcEnd;
+ curFilterPixel++) {
+ // Distance from the center of the filter, this is the filter coordinate
+ // in source space. We also need to consider the center of the pixel
+ // when comparing distance against 'srcPixel'. In the 5x downscale
+ // example used above the distance from the center of the filter to
+ // the pixel with coordinates (2, 2) should be 0, because its center
+ // is at (2.5, 2.5).
+ float srcFilterDist =
+ ((static_cast<float>(curFilterPixel) + 0.5f) - srcPixel);
+
+ // Since the filter really exists in dest space, map it there.
+ float destFilterDist = srcFilterDist * clampedScale;
+
+ // Compute the filter value at that location.
+ float filterValue = fBitmapFilter->evaluate(destFilterDist);
+ filterValues.push_back(filterValue);
+
+ filterSum += filterValue;
+ }
+ SkASSERT(!filterValues.empty());
+
+ // The filter must be normalized so that we don't affect the brightness of
+ // the image. Convert to normalized fixed point.
+ // NOTE(review): nothing here guards against filterSum being 0, which would
+ // make the division below produce inf/NaN -- confirm the kernels used can
+ // never sum to zero over the covered range.
+ short fixedSum = 0;
+ for (int i = 0; i < filterValues.count(); i++) {
+ short curFixed = output->FloatToFixed(filterValues[i] / filterSum);
+ fixedSum += curFixed;
+ fixedFilterValues.push_back(curFixed);
+ }
+
+ // The conversion to fixed point will leave some rounding errors, which
+ // we add back in to avoid affecting the brightness of the image. We
+ // arbitrarily add this to the center of the filter array (this won't always
+ // be the center of the filter function since it could get clipped on the
+ // edges, but it doesn't matter enough to worry about that case).
+ short leftovers = output->FloatToFixed(1.0f) - fixedSum;
+ fixedFilterValues[fixedFilterValues.count() / 2] += leftovers;
+
+ // Now it's ready to go.
+ output->AddFilter(srcBegin, &fixedFilterValues[0],
+ static_cast<int>(fixedFilterValues.count()));
+ }
+
+ // Optional platform hook, run once after all filters have been added.
+ if (convolveProcs->fApplySIMDPadding) {
+ convolveProcs->fApplySIMDPadding( output );
+ }
+}
+
+// Maps |method| to a concrete algorithm method. Algorithm methods are returned
+// unchanged; RESIZE_GOOD becomes RESIZE_TRIANGLE, RESIZE_BETTER becomes
+// RESIZE_HAMMING, and anything else becomes RESIZE_MITCHELL.
+static SkBitmapScaler::ResizeMethod ResizeMethodToAlgorithmMethod(
+ SkBitmapScaler::ResizeMethod method) {
+ // Convert any "Quality Method" into an "Algorithm Method"
+ if (method >= SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD &&
+ method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD) {
+ return method;
+ }
+ // NOTE(review): nothing in this file attempts a GPU-accelerated resize
+ // first (the previous wording here referred to one), so we just
+ // pick the appropriate software method for each resize quality.
+ switch (method) {
+ // Users of RESIZE_GOOD are willing to trade a lot of quality to
+ // get speed, allowing the use of linear resampling to get hardware
+ // acceleration (SRB). Hence any of our "good" software filters
+ // will be acceptable, so we use a triangle.
+ case SkBitmapScaler::RESIZE_GOOD:
+ return SkBitmapScaler::RESIZE_TRIANGLE;
+ // Users of RESIZE_BETTER are willing to trade some quality in order
+ // to improve performance, but are guaranteed not to devolve to a linear
+ // resampling. In visual tests we see that Hamming-1 is not as good as
+ // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is
+ // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed
+ // an acceptable trade-off between quality and speed.
+ case SkBitmapScaler::RESIZE_BETTER:
+ return SkBitmapScaler::RESIZE_HAMMING;
+ default:
+ return SkBitmapScaler::RESIZE_MITCHELL;
+ }
+}
+
+// static
+// Resamples |source| to a |destWidth| x |destHeight| image and returns the
+// |destSubset| portion of that result as a new ARGB_8888 bitmap.
+//
+// |method| may be a quality method or an algorithm method; quality methods
+// are mapped to an algorithm method first. |convolveProcs| supplies the
+// convolution routines and |allocator| is used to allocate the result pixels.
+//
+// Returns an empty SkBitmap when |destSubset| is not contained in the
+// destination rect (an invalid-argument error is recorded as well), when a
+// source or destination dimension is smaller than 1, when |source| is not a
+// drawable ARGB_8888 bitmap, or when the result pixels cannot be allocated.
+SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
+ ResizeMethod method,
+ int destWidth, int destHeight,
+ const SkIRect& destSubset,
+ SkConvolutionProcs* convolveProcs,
+ SkBitmap::Allocator* allocator) {
+ // Ensure that the ResizeMethod enumeration is sound.
+ SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) &&
+ (method <= RESIZE_LAST_QUALITY_METHOD)) ||
+ ((RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+ (method <= RESIZE_LAST_ALGORITHM_METHOD)));
+
+ SkIRect dest = { 0, 0, destWidth, destHeight };
+ if (!dest.contains(destSubset)) {
+ SkErrorInternals::SetError( kInvalidArgument_SkError,
+ "Sorry, the destination bitmap scale subset "
+ "falls outside the full-sized destination bitmap." );
+ // Do not build filters or convolve for a subset outside the destination.
+ return SkBitmap();
+ }
+
+ // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just
+ // return empty.
+ if (source.width() < 1 || source.height() < 1 ||
+ destWidth < 1 || destHeight < 1) {
+ return SkBitmap();
+ }
+
+ method = ResizeMethodToAlgorithmMethod(method);
+
+ // Check that we deal with an "algorithm method" from this point onward.
+ SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) &&
+ (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD));
+
+ SkAutoLockPixels locker(source);
+ if (!source.readyToDraw() || source.config() != SkBitmap::kARGB_8888_Config)
+ return SkBitmap();
+
+ SkResizeFilter filter(method, source.width(), source.height(),
+ destWidth, destHeight, destSubset, convolveProcs);
+
+ // The convolution reads directly from the source's pixel memory; no copy
+ // of the source is made.
+ const unsigned char* sourceSubset =
+ reinterpret_cast<const unsigned char*>(source.getPixels());
+
+ // Convolve into the result.
+ SkBitmap result;
+ result.setConfig(SkBitmap::kARGB_8888_Config,
+ destSubset.width(), destSubset.height());
+ result.allocPixels(allocator, NULL);
+ if (!result.readyToDraw())
+ return SkBitmap();
+
+ BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()),
+ !source.isOpaque(), filter.xFilter(), filter.yFilter(),
+ static_cast<int>(result.rowBytes()),
+ static_cast<unsigned char*>(result.getPixels()),
+ convolveProcs, true);
+
+ // Preserve the "opaque" flag for use as an optimization later.
+ result.setIsOpaque(source.isOpaque());
+
+ return result;
+}
+
+// static
+// Convenience overload: resizes |source| to |destWidth| x |destHeight| and
+// returns the whole result, by forwarding to the subset overload with a
+// subset of { 0, 0, destWidth, destHeight }.
+SkBitmap SkBitmapScaler::Resize(const SkBitmap& source,
+ ResizeMethod method,
+ int destWidth, int destHeight,
+ SkConvolutionProcs* convolveProcs,
+ SkBitmap::Allocator* allocator) {
+ SkIRect destSubset = { 0, 0, destWidth, destHeight };
+ return Resize(source, method, destWidth, destHeight, destSubset,
+ convolveProcs, allocator);
+}
diff --git a/src/core/SkBitmapScaler.h b/src/core/SkBitmapScaler.h
new file mode 100644
index 0000000000..5682cc578d
--- /dev/null
+++ b/src/core/SkBitmapScaler.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkBitmapScaler_DEFINED
+#define SkBitmapScaler_DEFINED
+
+#include "SkBitmap.h"
+#include "SkConvolver.h"
+
+/** \class SkBitmapScaler
+
+ Provides the interface for high quality image resampling.
+ */
+
+class SK_API SkBitmapScaler {
+public:
+ enum ResizeMethod {
+ // Quality Methods
+ //
+ // These enumeration values express a desired quality/speed tradeoff.
+ // They are translated into an algorithm-specific method that depends
+ // on the capabilities (CPU, GPU) of the underlying platform.
+ // It is possible for all three methods to be mapped to the same
+ // algorithm on a given platform.
+
+ // Good quality resizing. Fastest resizing with acceptable visual quality.
+ // This is typically intended for use during interactive layouts
+ // where slower platforms may want to trade image quality for large
+ // increase in resizing performance.
+ //
+ // For example the resizing implementation may devolve to linear
+ // filtering if this enables GPU acceleration to be used.
+ //
+ // Note that the underlying resizing method may be determined
+ // on the fly based on the parameters for a given resize call.
+ // For example an implementation using a GPU-based linear filter
+ // in the common case may still use a higher-quality software-based
+ // filter in cases where using the GPU would actually be slower - due
+ // to too much latency - or impossible - due to image format or size
+ // constraints.
+ RESIZE_GOOD,
+
+ // Medium quality resizing. Close to high quality resizing (better
+ // than linear interpolation) with potentially some quality being
+ // traded-off for additional speed compared to RESIZE_BEST.
+ //
+ // This is intended, for example, for generation of large thumbnails
+ // (hundreds of pixels in each dimension) from large sources, where
+ // a linear filter would produce too many artifacts but where
+ // RESIZE_BEST might be too costly time-wise.
+ RESIZE_BETTER,
+
+ // High quality resizing. The algorithm is picked to favor image quality.
+ RESIZE_BEST,
+
+ //
+ // Algorithm-specific enumerations
+ //
+
+ // Box filter. This is a weighted average of all of the pixels touching
+ // the destination pixel. For enlargement, this is nearest neighbor.
+ //
+ // You probably don't want this, it is here for testing since it is easy to
+ // compute. Use RESIZE_LANCZOS3 instead.
+ RESIZE_BOX,
+ RESIZE_TRIANGLE,
+ RESIZE_LANCZOS3,
+ RESIZE_HAMMING,
+ RESIZE_MITCHELL,
+
+ // enum aliases for first and last methods by algorithm or by quality.
+ // Resize() asserts that the method it ends up using lies between
+ // RESIZE_FIRST_ALGORITHM_METHOD and RESIZE_LAST_ALGORITHM_METHOD.
+ RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
+ RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
+ RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
+ RESIZE_LAST_ALGORITHM_METHOD = RESIZE_MITCHELL,
+ };
+
+ // Resizes the given source bitmap using the specified resize method, so that
+ // the entire image is (dest_width x dest_height) big. The dest_subset is the
+ // rectangle in this destination image that should actually be returned.
+ //
+ // The output image will be (dest_subset.width(), dest_subset.height()). This
+ // will save work if you do not need the entire bitmap.
+ //
+ // The destination subset must fit within the destination image (it may
+ // cover all of it).
+ //
+ // An empty SkBitmap is returned when the source or destination has a
+ // dimension smaller than 1, when the source is not a readable
+ // kARGB_8888_Config bitmap, or when the result pixels cannot be allocated.
+ //
+ // |convolveProcs| supplies the optional platform-specific convolution
+ // routines; |allocator| is handed to the result bitmap's allocPixels().
+ static SkBitmap Resize(const SkBitmap& source,
+ ResizeMethod method,
+ int dest_width, int dest_height,
+ const SkIRect& dest_subset,
+ SkConvolutionProcs *convolveProcs = NULL,
+ SkBitmap::Allocator* allocator = NULL);
+
+ // Alternate version for resizing and returning the entire bitmap rather than
+ // a subset. Equivalent to calling the version above with a dest_subset of
+ // { 0, 0, dest_width, dest_height }.
+ static SkBitmap Resize(const SkBitmap& source,
+ ResizeMethod method,
+ int dest_width, int dest_height,
+ SkConvolutionProcs *convolveProcs = NULL,
+ SkBitmap::Allocator* allocator = NULL);
+};
+
+#endif
diff --git a/src/core/SkConvolver.cpp b/src/core/SkConvolver.cpp
new file mode 100644
index 0000000000..54e46b63fe
--- /dev/null
+++ b/src/core/SkConvolver.cpp
@@ -0,0 +1,473 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "SkConvolver.h"
+#include "SkSize.h"
+#include "SkTypes.h"
+
+namespace {
+
+    // Saturates |a| to the 0-255 range and returns it as an 8-bit value.
+    inline unsigned char ClampTo8(int a) {
+        // A negative input becomes a very large number once reinterpreted as
+        // unsigned, so one comparison recognises every in-range value.
+        const bool fitsInByte = static_cast<unsigned>(a) <= 255u;
+        if (fitsInByte) {
+            return a;
+        }
+        // Out of range: below zero saturates low, everything else high.
+        return (a < 0) ? 0 : 255;
+    }
+
+    // Converts an accumulated (pixel * kernel) sum back to an 8-bit channel.
+    // The kernel weights are fixed point with kShiftBits fractional bits, so
+    // the sum is shifted down by that amount first. When |takeAbsolute| is
+    // set, the magnitude of the shifted value is used before clamping.
+    inline unsigned char BringBackTo8(int a, bool takeAbsolute) {
+        int scaled = a >> SkConvolutionFilter1D::kShiftBits;
+        if (takeAbsolute && scaled < 0) {
+            scaled = -scaled;
+        }
+        return ClampTo8(scaled);
+    }
+
+    // A fixed number of equally sized pixel rows used as a ring. Rows are
+    // written by calling advanceRow(), which hands out the next slot and
+    // remembers both which slot comes next and how many rows have been handed
+    // out so far (as an image y coordinate).
+    class CircularRowBuffer {
+    public:
+        // Each row holds |destRowPixelWidth| pixels of 4 bytes, and the ring
+        // has |maxYFilterSize| rows (enough for the tallest vertical filter).
+        //
+        // |firstInputRow| is the y coordinate assigned to the first row
+        // returned by advanceRow(); later rows count up from there.
+        CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize,
+                          int firstInputRow)
+            : fRowByteWidth(destRowPixelWidth * 4),
+              fNumRows(maxYFilterSize),
+              fNextRow(0),
+              fNextRowCoordinate(firstInputRow) {
+            fBuffer.reset(fRowByteWidth * fNumRows);
+            fRowAddresses.reset(fNumRows);
+        }
+
+        // Hands out the slot for the next row and returns a pointer to its
+        // first byte. The slot index wraps; the y coordinate does not.
+        unsigned char* advanceRow() {
+            unsigned char* const slot = &fBuffer[fNextRow * fRowByteWidth];
+            ++fNextRowCoordinate;
+
+            const int following = fNextRow + 1;
+            fNextRow = (following == fNumRows) ? 0 : following;
+            return slot;
+        }
+
+        // Returns the rows as a flat array ordered by increasing y
+        // coordinate. |*firstRowIndex| receives the y coordinate of entry 0,
+        // and the array always has as many entries as the ring has rows.
+        //
+        // |*firstRowIndex| may be negative, meaning the ring has not been
+        // filled yet and its oldest slots lie before the first written row.
+        unsigned char* const* GetRowAddresses(int* firstRowIndex) {
+            // Example for a 4-row ring holding coordinates 6-9:
+            //   Row 0   Coord 8
+            //   Row 1   Coord 9
+            //   Row 2   Coord 6   <- fNextRow = 2, fNextRowCoordinate = 10.
+            //   Row 3   Coord 7
+            //
+            // The slot that will be overwritten next also holds the oldest
+            // (lowest) coordinate. A negative result is fine: callers index
+            // relative to |*firstRowIndex| and never touch the negative rows.
+            *firstRowIndex = fNextRowCoordinate - fNumRows;
+
+            int slot = fNextRow;
+            for (int ordered = 0; ordered < fNumRows; ++ordered) {
+                fRowAddresses[ordered] = &fBuffer[slot * fRowByteWidth];
+
+                // Step to the following slot, wrapping at the end of the ring.
+                if (++slot == fNumRows) {
+                    slot = 0;
+                }
+            }
+            return &fRowAddresses[0];
+        }
+
+    private:
+        // Backing storage for all rows, packed back to back, fRowByteWidth
+        // bytes each.
+        SkTArray<unsigned char> fBuffer;
+
+        // Size of one row in |fBuffer|, in bytes.
+        int fRowByteWidth;
+
+        // How many rows the ring holds.
+        int fNumRows;
+
+        // Slot index the next advanceRow() call will hand out; wraps around.
+        int fNextRow;
+
+        // The y coordinate that belongs to |fNextRow|. Grows by one per
+        // advanceRow() call and never wraps.
+        int fNextRowCoordinate;
+
+        // Scratch array filled in and returned by GetRowAddresses().
+        SkTArray<unsigned char*> fRowAddresses;
+    };
+
+// Horizontal pass for one row: produces filter.numValues() output pixels in
+// |outRow| from the 4-byte-per-pixel row at |srcData|. When |hasAlpha| is
+// false the fourth byte of each pixel is neither accumulated nor written.
+template<bool hasAlpha>
+    void ConvolveHorizontally(const unsigned char* srcData,
+                              const SkConvolutionFilter1D& filter,
+                              unsigned char* outRow) {
+        const int channelCount = hasAlpha ? 4 : 3;
+        const int outputWidth = filter.numValues();
+
+        for (int outX = 0; outX < outputWidth; ++outX) {
+            // Fetch the weights for this output pixel and the index of the
+            // first source pixel they apply to.
+            int firstSrcPixel, tapCount;
+            const SkConvolutionFilter1D::ConvolutionFixed* taps =
+                filter.FilterForValue(outX, &firstSrcPixel, &tapCount);
+
+            // The filter covers |tapCount| consecutive pixels from here.
+            const unsigned char* window = &srcData[firstSrcPixel * 4];
+
+            // Weighted sum per channel.
+            int sums[4] = {0};
+            for (int tap = 0; tap < tapCount; ++tap) {
+                const SkConvolutionFilter1D::ConvolutionFixed weight = taps[tap];
+                const int pixelByte = tap * 4;
+                for (int channel = 0; channel < channelCount; ++channel) {
+                    sums[channel] += weight * window[pixelByte + channel];
+                }
+            }
+
+            // The weights carry kShiftBits fractional bits; drop them, then
+            // clamp and store.
+            const int outByte = outX * 4;
+            for (int channel = 0; channel < channelCount; ++channel) {
+                outRow[outByte + channel] =
+                    ClampTo8(sums[channel] >> SkConvolutionFilter1D::kShiftBits);
+            }
+        }
+    }
+
+// Vertical pass for one output row. |filterValues| holds |filterLength|
+// weights; weight i is applied to row i of |sourceDataRows|, each of which is
+// |pixelWidth| pixels wide.
+//
+// |outRow| must have room for |pixelWidth * 4| bytes.
+template<bool hasAlpha>
+    void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow) {
+        const int channelCount = hasAlpha ? 4 : 3;
+
+        // One output pixel per column.
+        for (int column = 0; column < pixelWidth; ++column) {
+            // Byte position of this column inside every row (4 bytes/pixel).
+            const int byteOffset = column * 4;
+
+            // Weighted sum down the column, per channel.
+            int sums[4] = {0};
+            for (int row = 0; row < filterLength; ++row) {
+                const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[row];
+                const unsigned char* srcPixel = &sourceDataRows[row][byteOffset];
+                for (int channel = 0; channel < channelCount; ++channel) {
+                    sums[channel] += weight * srcPixel[channel];
+                }
+            }
+
+            // Drop the kShiftBits fractional bits, clamp, and store colour.
+            unsigned char* dstPixel = &outRow[byteOffset];
+            for (int channel = 0; channel < 3; ++channel) {
+                dstPixel[channel] =
+                    ClampTo8(sums[channel] >> SkConvolutionFilter1D::kShiftBits);
+            }
+
+            if (!hasAlpha) {
+                // Opaque image: alpha is always fully set.
+                dstPixel[3] = 0xff;
+                continue;
+            }
+
+            const unsigned char alpha =
+                ClampTo8(sums[3] >> SkConvolutionFilter1D::kShiftBits);
+
+            // With premultiplied alpha no colour channel should exceed alpha,
+            // but rounding occasionally produces such a pixel. Those
+            // "impossible" colours overflow when drawn, so alpha is raised to
+            // the largest colour channel. This is only needed here, where the
+            // final output row is produced.
+            const int maxColorChannel = SkTMax(dstPixel[0],
+                                               SkTMax(dstPixel[1], dstPixel[2]));
+            dstPixel[3] = (alpha < maxColorChannel) ? maxColorChannel : alpha;
+        }
+    }
+
+    // Runtime dispatcher: forwards to the ConvolveVertically<> instantiation
+    // that matches |sourceHasAlpha|.
+    void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow,
+                            bool sourceHasAlpha) {
+        if (!sourceHasAlpha) {
+            ConvolveVertically<false>(filterValues, filterLength,
+                                      sourceDataRows, pixelWidth, outRow);
+            return;
+        }
+        ConvolveVertically<true>(filterValues, filterLength,
+                                 sourceDataRows, pixelWidth, outRow);
+    }
+
+} // namespace
+
+// SkConvolutionFilter1D ---------------------------------------------------------
+
+// Starts with no filters, so the largest filter length seen so far is zero.
+SkConvolutionFilter1D::SkConvolutionFilter1D()
+    : fMaxFilter(0) {}
+
+// No explicit cleanup is performed here.
+SkConvolutionFilter1D::~SkConvolutionFilter1D() {}
+
+// Floating-point convenience overload: converts the |filterLength| weights in
+// |filterValues| to ConvolutionFixed and forwards them, together with
+// |filterOffset|, to the fixed-point AddFilter(). |filterLength| must be > 0.
+void SkConvolutionFilter1D::AddFilter(int filterOffset,
+                                      const float* filterValues,
+                                      int filterLength) {
+    SkASSERT(filterLength > 0);
+
+    // reset(n) is used in this file to size an array to n entries that are
+    // then addressed by index, so the converted weights are written into
+    // those entries directly. Appending with push_back() after reset(n) would
+    // place them behind the n existing entries, and &fixedValues[0] would
+    // then hand the fixed-point overload values that were never assigned.
+    SkTArray<ConvolutionFixed> fixedValues;
+    fixedValues.reset(filterLength);
+
+    for (int i = 0; i < filterLength; ++i) {
+        fixedValues[i] = FloatToFixed(filterValues[i]);
+    }
+
+    AddFilter(filterOffset, &fixedValues[0], filterLength);
+}
+
+// Fixed-point overload: records one filter (the weights for one output
+// pixel). Leading and trailing zero weights are dropped before storing, and
+// the recorded offset is moved past the dropped leading zeros.
+void SkConvolutionFilter1D::AddFilter(int filterOffset,
+                                      const ConvolutionFixed* filterValues,
+                                      int filterLength) {
+    // Zero weights at either end are common, and keeping only the central
+    // ones pays off: scaling a 1080p image to 1/4 size per dimension with a
+    // Lanczos-2 filter gets roughly 10% faster.
+    const int specifiedLength = filterLength;
+
+    int head = 0;
+    while (head < specifiedLength && 0 == filterValues[head]) {
+        ++head;
+    }
+
+    // Stays zero when every weight was zero.
+    int trimmedLength = 0;
+    if (head < specifiedLength) {
+        // At least one weight is non-zero, so this scan stops at or after
+        // |head|.
+        int tail = specifiedLength - 1;
+        while (tail >= 0 && 0 == filterValues[tail]) {
+            --tail;
+        }
+
+        filterOffset += head;
+        trimmedLength = tail - head + 1;
+        SkASSERT(trimmedLength > 0);
+
+        for (int i = 0; i < trimmedLength; ++i) {
+            fFilterValues.push_back(filterValues[head + i]);
+        }
+    }
+
+    FilterInstance instance;
+
+    // The |trimmedLength| weights just appended sit at the end of
+    // fFilterValues.
+    instance.fDataLocation =
+        static_cast<int>(fFilterValues.count()) - trimmedLength;
+    instance.fOffset = filterOffset;
+    instance.fTrimmedLength = trimmedLength;
+    instance.fLength = specifiedLength;
+    fFilters.push_back(instance);
+
+    fMaxFilter = SkTMax(fMaxFilter, trimmedLength);
+}
+
+// Reports the first filter added to this object. |*filterOffset| and
+// |*filterLength| describe the stored (trimmed) weights, while
+// |*specifiedFilterlength| is the length originally passed to AddFilter().
+// Returns a pointer to the stored weights, or NULL when none were kept.
+const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter(
+        int* specifiedFilterlength,
+        int* filterOffset,
+        int* filterLength) const {
+    const FilterInstance& first = fFilters[0];
+
+    *filterOffset = first.fOffset;
+    *filterLength = first.fTrimmedLength;
+    *specifiedFilterlength = first.fLength;
+
+    const bool hasWeights = (first.fTrimmedLength != 0);
+    if (hasWeights) {
+        return &fFilterValues[first.fDataLocation];
+    }
+    return NULL;
+}
+
+// Resamples a 4-byte-per-pixel image in two passes: every needed source row
+// is convolved horizontally with |filterX| into a small ring of rows, and as
+// soon as enough rows are available each output row is produced by convolving
+// vertically with |filterY|.
+//
+//   sourceData / sourceByteRowStride  first source byte and bytes per row.
+//   sourceHasAlpha                    false lets the portable code skip alpha.
+//   output / outputByteRowStride      destination and bytes per row; each row
+//                                     receives filterX.numValues() pixels.
+//   convolveProcs                     optional SIMD routines. May be NULL,
+//                                     and any entry may be NULL; the portable
+//                                     C++ code is used for whatever is
+//                                     missing.
+//   useSimdIfPossible                 when false, |convolveProcs| is ignored
+//                                     and only the portable code runs.
+void BGRAConvolve2D(const unsigned char* sourceData,
+                    int sourceByteRowStride,
+                    bool sourceHasAlpha,
+                    const SkConvolutionFilter1D& filterX,
+                    const SkConvolutionFilter1D& filterY,
+                    int outputByteRowStride,
+                    unsigned char* output,
+                    SkConvolutionProcs* convolveProcs,
+                    bool useSimdIfPossible) {
+    // With no filters in either direction there are no output pixels, and
+    // FilterForValue() below must not be asked for an entry that is absent.
+    if (filterX.numValues() <= 0 || filterY.numValues() <= 0) {
+        return;
+    }
+
+    // Callers may pass NULL (it is the default for SkBitmapScaler::Resize),
+    // so substitute a table with no SIMD routines instead of dereferencing
+    // it. The same table is used when the caller asked for no SIMD.
+    SkConvolutionProcs portableProcs = { 0, NULL, NULL, NULL, NULL };
+    const SkConvolutionProcs* procs =
+        (useSimdIfPossible && NULL != convolveProcs) ? convolveProcs
+                                                      : &portableProcs;
+
+    int maxYFilterSize = filterY.maxFilter();
+
+    // The next row in the input that we will generate a horizontally
+    // convolved row for. If the filter doesn't start at the beginning of the
+    // image (this is the case when we are only resizing a subset), then we
+    // don't want to generate any output rows before that. Compute the
+    // starting row for convolution as the first pixel for the first vertical
+    // filter.
+    int filterOffset, filterLength;
+    const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
+        filterY.FilterForValue(0, &filterOffset, &filterLength);
+    int nextXRow = filterOffset;
+
+    // We loop over each row in the input doing a horizontal convolution. This
+    // results in a horizontally convolved image. The results go into a
+    // circular buffer of convolved rows, and vertical convolution runs as
+    // rows become available. This avoids storing the entire intermediate
+    // image and helps cache coherency.
+    // Four extra rows are reserved when the 4-rows-at-a-time horizontal
+    // routine is present. The row width is rounded up to a multiple of 16
+    // pixels.
+    // TODO(jiesun): We do not use aligned load from row buffer in vertical
+    // convolution pass yet. Somehow Windows does not like it.
+    int rowBufferWidth = (filterX.numValues() + 15) & ~0xF;
+    int rowBufferHeight = maxYFilterSize +
+                          (procs->fConvolve4RowsHorizontally ? 4 : 0);
+    CircularRowBuffer rowBuffer(rowBufferWidth,
+                                rowBufferHeight,
+                                filterOffset);
+
+    // Loop over every possible output row, processing just enough horizontal
+    // convolutions to run each subsequent vertical convolution.
+    SkASSERT(outputByteRowStride >= filterX.numValues() * 4);
+    int numOutputRows = filterY.numValues();
+
+    // We need to check which is the last line to convolve before we advance 4
+    // lines in one iteration.
+    int lastFilterOffset, lastFilterLength;
+
+    // The SIMD horizontal routines may read up to fExtraHorizontalReads
+    // pixels past the end of a row. At the bottom of the image that would
+    // run past the end of the buffer, so the last row normally falls back to
+    // the C++ implementation. If rows are narrower than the extra reads,
+    // more than one trailing row has to fall back; compute how many here.
+    filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset,
+                           &lastFilterLength);
+    // An all-zero last filter at offset 0 ends at pixel 0; treat the row as
+    // one pixel wide so the division below is always defined.
+    int lastXPixelEnd = SkTMax(1, lastFilterOffset + lastFilterLength);
+    int avoidSimdRows = 1 + procs->fExtraHorizontalReads / lastXPixelEnd;
+
+    // From here on lastFilterOffset/lastFilterLength describe the last
+    // vertical filter, i.e. the last source row that will be needed.
+    filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
+                           &lastFilterLength);
+
+    for (int outY = 0; outY < numOutputRows; outY++) {
+        filterValues = filterY.FilterForValue(outY,
+                                              &filterOffset, &filterLength);
+
+        // Generate output rows until we have enough to run the current filter.
+        while (nextXRow < filterOffset + filterLength) {
+            if (procs->fConvolve4RowsHorizontally &&
+                nextXRow + 3 < lastFilterOffset + lastFilterLength -
+                               avoidSimdRows) {
+                const unsigned char* src[4];
+                unsigned char* outRow[4];
+                for (int i = 0; i < 4; ++i) {
+                    src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride];
+                    outRow[i] = rowBuffer.advanceRow();
+                }
+                procs->fConvolve4RowsHorizontally(src, filterX, outRow);
+                nextXRow += 4;
+            } else {
+                // Check if we need to avoid SIMD for this row.
+                if (procs->fConvolveHorizontally &&
+                    nextXRow < lastFilterOffset + lastFilterLength -
+                               avoidSimdRows) {
+                    procs->fConvolveHorizontally(
+                        &sourceData[nextXRow * sourceByteRowStride],
+                        filterX, rowBuffer.advanceRow(), sourceHasAlpha);
+                } else {
+                    if (sourceHasAlpha) {
+                        ConvolveHorizontally<true>(
+                            &sourceData[nextXRow * sourceByteRowStride],
+                            filterX, rowBuffer.advanceRow());
+                    } else {
+                        ConvolveHorizontally<false>(
+                            &sourceData[nextXRow * sourceByteRowStride],
+                            filterX, rowBuffer.advanceRow());
+                    }
+                }
+                nextXRow++;
+            }
+        }
+
+        // Compute where in the output image this row of final data will go.
+        unsigned char* curOutputRow = &output[outY * outputByteRowStride];
+
+        // Get the list of rows that the circular buffer has, in order.
+        int firstRowInCircularBuffer;
+        unsigned char* const* rowsToConvolve =
+            rowBuffer.GetRowAddresses(&firstRowInCircularBuffer);
+
+        // Now compute the start of the subset of those rows that the filter
+        // needs.
+        unsigned char* const* firstRowForFilter =
+            &rowsToConvolve[filterOffset - firstRowInCircularBuffer];
+
+        if (procs->fConvolveVertically) {
+            procs->fConvolveVertically(filterValues, filterLength,
+                                       firstRowForFilter,
+                                       filterX.numValues(), curOutputRow,
+                                       sourceHasAlpha);
+        } else {
+            ConvolveVertically(filterValues, filterLength,
+                               firstRowForFilter,
+                               filterX.numValues(), curOutputRow,
+                               sourceHasAlpha);
+        }
+    }
+}
diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h
new file mode 100644
index 0000000000..a2758e57a8
--- /dev/null
+++ b/src/core/SkConvolver.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SK_CONVOLVER_H
+#define SK_CONVOLVER_H
+
+#include "SkSize.h"
+#include "SkTypes.h"
+#include "SkTArray.h"
+
+// avoid confusion with Mac OS X's math library (Carbon)
+#if defined(__APPLE__)
+#undef FloatToConvolutionFixed
+#undef ConvolutionFixedToFloat
+#endif
+
+// Represents a filter in one dimension. Each output pixel has one entry in this
+// object for the filter values contributing to it. You build up the filter
+// list by calling AddFilter for each output pixel (in order).
+//
+// We do 2-dimensional convolution by first convolving each row by one
+// SkConvolutionFilter1D, then convolving each column by another one.
+//
+// Entries are stored in ConvolutionFixed point, shifted left by kShiftBits.
+class SkConvolutionFilter1D {
+public:
+ typedef short ConvolutionFixed;
+
+ // The number of bits that ConvolutionFixed point values are shifted by.
+ enum { kShiftBits = 14 };
+
+ SK_API SkConvolutionFilter1D();
+ SK_API ~SkConvolutionFilter1D();
+
+ // Convert between floating point and our ConvolutionFixed point representation.
+ // NOTE(review): FloatToFixed does not clamp, so a product outside the range
+ // of a short is not representable - confirm callers pass normalized weights.
+ static ConvolutionFixed FloatToFixed(float f) {
+ return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
+ }
+ static unsigned char FixedToChar(ConvolutionFixed x) {
+ return static_cast<unsigned char>(x >> kShiftBits);
+ }
+ static float FixedToFloat(ConvolutionFixed x) {
+ // The cast relies on ConvolutionFixed being a short, implying that on
+ // the platforms we care about all (16) bits will fit into
+ // the mantissa of a (32-bit) float.
+ SK_COMPILE_ASSERT(sizeof(ConvolutionFixed) == 2, ConvolutionFixed_type_should_fit_in_float_mantissa);
+ float raw = static_cast<float>(x);
+ return ldexpf(raw, -kShiftBits);
+ }
+
+ // Returns the maximum pixel span of a filter.
+ int maxFilter() const { return fMaxFilter; }
+
+ // Returns the number of filters in this filter. This is the dimension of the
+ // output image.
+ int numValues() const { return static_cast<int>(fFilters.count()); }
+
+ // Appends the given list of scaling values for generating a given output
+ // pixel. |filterOffset| is the distance from the edge of the image to where
+ // the scaling factors start. The scaling factors apply to the source pixels
+ // starting from this position, and going for the next |filterLength| pixels.
+ //
+ // You will probably want to make sure your input is normalized (that is,
+ // all entries in |filterValues| sum to one) to prevent affecting the overall
+ // brightness of the image.
+ //
+ // The filterLength must be > 0.
+ //
+ // This version will automatically convert your input to ConvolutionFixed point.
+ SK_API void AddFilter(int filterOffset,
+ const float* filterValues,
+ int filterLength);
+
+ // Same as the above version, but the input is already ConvolutionFixed point.
+ void AddFilter(int filterOffset,
+ const ConvolutionFixed* filterValues,
+ int filterLength);
+
+ // Retrieves a filter for the given |valueOffset|, a position in the output
+ // image in the direction we're convolving. The offset and length of the
+ // filter values are put into the corresponding out arguments (see AddFilter
+ // above for what these mean), and a pointer to the first scaling factor is
+ // returned. There will be |filterLength| values in this array.
+ // Returns NULL when the stored filter has no values (|filterLength| is 0).
+ inline const ConvolutionFixed* FilterForValue(int valueOffset,
+ int* filterOffset,
+ int* filterLength) const {
+ const FilterInstance& filter = fFilters[valueOffset];
+ *filterOffset = filter.fOffset;
+ *filterLength = filter.fTrimmedLength;
+ if (filter.fTrimmedLength == 0) {
+ return NULL;
+ }
+ return &fFilterValues[filter.fDataLocation];
+ }
+
+ // Retrieves the filter for the offset 0, presumed to be the one and only.
+ // The offset and length of the filter values are put into the corresponding
+ // out arguments (see AddFilter). Note that |filterLength| and
+ // |specifiedFilterLength| may be different if leading/trailing zeros of the
+ // original floating point form were clipped.
+ // There will be |filterLength| values in the return array.
+ // Returns NULL if the filter is 0-length (for instance when all floating
+ // point values passed to AddFilter were clipped to 0).
+ SK_API const ConvolutionFixed* GetSingleFilter(int* specifiedFilterLength,
+ int* filterOffset,
+ int* filterLength) const;
+
+ // Add another value to the fFilterValues array -- useful for
+ // SIMD padding which happens outside of this class.
+
+ void addFilterValue( ConvolutionFixed val ) {
+ fFilterValues.push_back( val );
+ }
+private:
+ struct FilterInstance {
+ // Offset within fFilterValues for this instance of the filter.
+ int fDataLocation;
+
+ // Index of the first source pixel this filter applies to, IN PIXELS.
+ // This is the offset given to AddFilter, advanced past any leading
+ // zero values that were trimmed.
+ int fOffset;
+
+ // Number of values in this filter instance.
+ int fTrimmedLength;
+
+ // Filter length as specified. Note that this may be different from
+ // fTrimmedLength if leading/trailing zeros of the original floating
+ // point form were clipped differently on each tail.
+ int fLength;
+ };
+
+ // Stores the information for each filter added to this class.
+ SkTArray<FilterInstance> fFilters;
+
+ // We store all the filter values in this flat list, indexed by
+ // |FilterInstance.fDataLocation| to avoid the mallocs required for storing
+ // each one separately.
+ SkTArray<ConvolutionFixed> fFilterValues;
+
+ // The maximum size of any filter we've added.
+ int fMaxFilter;
+};
+
+// Function-pointer types for platform-specific convolution routines. They are
+// collected in SkConvolutionProcs and called by BGRAConvolve2D.
+
+// Produces one output row by vertical convolution: |filterLength| weights in
+// |filterValues| are applied to the rows in |sourceDataRows|, each
+// |pixelWidth| pixels wide, and the result is written to |outRow|.
+typedef void (*SkConvolveVertically_pointer)(
+ const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+ int filterLength,
+ unsigned char* const* sourceDataRows,
+ int pixelWidth,
+ unsigned char* outRow,
+ bool hasAlpha);
+// Horizontally convolves four source rows with |filter| in one call, writing
+// each result to the matching entry of |outRow|.
+typedef void (*SkConvolve4RowsHorizontally_pointer)(
+ const unsigned char* srcData[4],
+ const SkConvolutionFilter1D& filter,
+ unsigned char* outRow[4]);
+// Horizontally convolves a single source row with |filter| into |outRow|.
+typedef void (*SkConvolveHorizontally_pointer)(
+ const unsigned char* srcData,
+ const SkConvolutionFilter1D& filter,
+ unsigned char* outRow,
+ bool hasAlpha);
+// Adjusts |filter| for use by the SIMD routines.
+// NOTE(review): presumably appends padding through addFilterValue(); the
+// caller is not visible here - confirm.
+typedef void (*SkConvolveFilterPadding_pointer)(
+ SkConvolutionFilter1D* filter);
+
+// Table of optional platform-specific convolution routines. BGRAConvolve2D
+// checks fConvolveVertically, fConvolve4RowsHorizontally and
+// fConvolveHorizontally for NULL before each use and runs its portable C++
+// code for any that are unset.
+struct SkConvolutionProcs {
+ // This is how many extra pixels may be read by the
+ // convolve*horizontally functions.
+ int fExtraHorizontalReads;
+ SkConvolveVertically_pointer fConvolveVertically;
+ SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
+ SkConvolveHorizontally_pointer fConvolveHorizontally;
+ SkConvolveFilterPadding_pointer fApplySIMDPadding;
+};
+
+
+
+// Does a two-dimensional convolution on the given source image.
+//
+// It is assumed the source pixel offsets referenced in the input filters
+// reference only valid pixels, so the source image size is not required. Each
+// row of the source image starts |sourceByteRowStride| after the previous
+// one (this allows you to have rows with some padding at the end).
+//
+// The result will be put into the given output buffer. The destination image
+// size will be xfilter.numValues() * yfilter.numValues() pixels. Each output
+// row starts |outputByteRowStride| bytes after the previous one and receives
+// xfilter.numValues() * 4 bytes, so the stride must be at least that large.
+//
+// |sourceHasAlpha| is a hint that allows us to avoid doing computations on
+// the alpha channel if the image is opaque. If you don't know, set this to
+// true and it will work properly, but setting this to false will be a few
+// percent faster if you know the image is opaque.
+//
+// |convolveProcs| is a table of optional platform-specific routines; any
+// routine left NULL in the table is replaced by the portable C++ code.
+// Callers should pass a valid table.
+//
+// |useSimdIfPossible| requests that the routines in |convolveProcs| be used
+// where available.
+// NOTE(review): confirm against the implementation how this flag is honored.
+//
+// The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
+// (this is ARGB when loaded into 32-bit words on a little-endian machine).
+SK_API void BGRAConvolve2D(const unsigned char* sourceData,
+ int sourceByteRowStride,
+ bool sourceHasAlpha,
+ const SkConvolutionFilter1D& xfilter,
+ const SkConvolutionFilter1D& yfilter,
+ int outputByteRowStride,
+ unsigned char* output,
+ SkConvolutionProcs* convolveProcs,
+ bool useSimdIfPossible);
+
+#endif // SK_CONVOLVER_H
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index f992bcb636..95492c596d 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -11,6 +11,7 @@
#include "SkColorPriv.h"
#include "SkUnPreMultiply.h"
#include "SkShader.h"
+#include "SkConvolver.h"
#include "SkBitmapFilter_opts_SSE2.h"
@@ -180,3 +181,456 @@ void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
}
}
+
+// Convolves horizontally along a single row using SSE2. The row data is given in |src_data|; one
+// 4-byte pixel is written to |out_row| for each of filter.numValues() outputs. The alpha hint is ignored.
+void convolveHorizontally_SSE2(const unsigned char* src_data,
+ const SkConvolutionFilter1D& filter,
+ unsigned char* out_row,
+ bool /*has_alpha*/) {
+ int num_values = filter.numValues();
+
+ int filter_offset, filter_length;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask[4];
+ // |mask| will be used to zero out all extra filter coefficients that are
+ // loaded by SIMD when |filter_length| is not divisible by 4.
+ // mask[0] is never read: the remainder path below only runs when r != 0.
+ mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+ mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+ mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+ // Output one pixel each iteration, calculating all channels (RGBA) together.
+ for (int out_x = 0; out_x < num_values; out_x++) {
+ const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+ filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+ __m128i accum = _mm_setzero_si128();
+
+ // Compute the first pixel in this row that the filter affects. It will
+ // touch |filter_length| pixels (4 bytes each) after this.
+ const __m128i* row_to_filter =
+ reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
+
+ // We will load and accumulate with four coefficients per iteration.
+ for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
+
+ // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
+ __m128i coeff, coeff16;
+ // [16] xx xx xx xx c3 c2 c1 c0
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // [16] xx xx xx xx c1 c1 c0 c0
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ // [16] c1 c1 c1 c1 c0 c0 c0 c0
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+ // Load four pixels => unpack the first two pixels to 16 bits =>
+ // multiply with coefficients => accumulate the convolution result.
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src8 = _mm_loadu_si128(row_to_filter);
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0*c0 b0*c0 g0*c0 r0*c0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ // [32] a1*c1 b1*c1 g1*c1 r1*c1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ // Duplicate 3rd and 4th coefficients for all channels =>
+ // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
+ // => accumulate the convolution results.
+ // [16] xx xx xx xx c3 c3 c2 c2
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ // [16] c3 c3 c3 c3 c2 c2 c2 c2
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2*c2 b2*c2 g2*c2 r2*c2
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ // [32] a3*c3 b3*c3 g3*c3 r3*c3
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ // Advance the pixel and coefficients pointers.
+ row_to_filter += 1;
+ filter_values += 4;
+ }
+
+ // When |filter_length| is not divisible by 4, the final load picks up
+ // coefficients beyond the end of this filter; those are masked to zero.
+ // Other than that the algorithm is the same as above, except that the 4th
+ // pixel is always absent.
+ int r = filter_length&3;
+ if (r) {
+ // Note: the 8-byte load below may read past the last coefficient, so filter_values must be padded (see applySIMDPadding_SSE2).
+ __m128i coeff, coeff16;
+ coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+ // Mask out extra filter taps.
+ coeff = _mm_and_si128(coeff, mask[r]);
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+
+ // Note: this 16-byte load may read past the end of the row, so the line buffer must be padded.
+ // The caller is expected to resolve this by using the C version for the last horizontal line.
+ __m128i src8 = _mm_loadu_si128(row_to_filter);
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+ coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum = _mm_add_epi32(accum, t);
+ }
+
+ // Shift right for fixed point implementation.
+ accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
+
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ accum = _mm_packs_epi32(accum, zero);
+ // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+ accum = _mm_packus_epi16(accum, zero);
+
+ // Store the pixel value of 32 bits.
+ *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
+ out_row += 4;
+ }
+}
+
+// Convolves horizontally along four rows at once using SSE2. The algorithm
+// is almost the same as convolveHorizontally_SSE2; please refer to that
+// function for detailed comments.
+//
+// src_data: start of each of the four source rows (4 bytes per pixel).
+// filter:   supplies, per output pixel, the source offset, tap count and
+//           coefficients via FilterForValue(); numValues() outputs are
+//           produced for every row.
+// out_row:  destination pointer for each row. Each entry is advanced by
+//           4 bytes per pixel written, so the caller's array is modified.
+void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
+                                    const SkConvolutionFilter1D& filter,
+                                    unsigned char* out_row[4]) {
+    int num_values = filter.numValues();
+
+    int filter_offset, filter_length;
+    __m128i zero = _mm_setzero_si128();
+    __m128i mask[4];
+    // |mask| will be used to zero out all extra filter coefficients that are
+    // loaded by SIMD when |filter_length| is not divisible by 4.
+    // mask[0] is never read: the remainder path only runs when r != 0.
+    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+
+    // Output one pixel per row each iteration, calculating all channels
+    // (RGBA) together.
+    for (int out_x = 0; out_x < num_values; out_x++) {
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
+            filter.FilterForValue(out_x, &filter_offset, &filter_length);
+
+        // four pixels in a column per iteration.
+        __m128i accum0 = _mm_setzero_si128();
+        __m128i accum1 = _mm_setzero_si128();
+        __m128i accum2 = _mm_setzero_si128();
+        __m128i accum3 = _mm_setzero_si128();
+        // Byte offset of the first source pixel touched by this filter.
+        int start = (filter_offset<<2);
+        // We will load and accumulate with four coefficients per iteration.
+        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
+            __m128i coeff, coeff16lo, coeff16hi;
+            // [16] xx xx xx xx c3 c2 c1 c0
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // [16] xx xx xx xx c1 c1 c0 c0
+            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            // [16] c1 c1 c1 c1 c0 c0 c0 c0
+            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+            // [16] xx xx xx xx c3 c3 c2 c2
+            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            // [16] c3 c3 c3 c3 c2 c2 c2 c2
+            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+            __m128i src8, src16, mul_hi, mul_lo, t;
+
+// Loads four pixels from |src|, multiplies them by the four coefficients held
+// in coeff16lo/coeff16hi and adds the products into |accum|. Relies on the
+// locals src8, src16, mul_hi, mul_lo, t, zero, coeff16lo and coeff16hi.
+#define ITERATION(src, accum) \
+    src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
+    src16 = _mm_unpacklo_epi8(src8, zero); \
+    mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
+    mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
+    t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
+    accum = _mm_add_epi32(accum, t); \
+    t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
+    accum = _mm_add_epi32(accum, t); \
+    src16 = _mm_unpackhi_epi8(src8, zero); \
+    mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
+    mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
+    t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
+    accum = _mm_add_epi32(accum, t); \
+    t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
+    accum = _mm_add_epi32(accum, t)
+
+            ITERATION(src_data[0] + start, accum0);
+            ITERATION(src_data[1] + start, accum1);
+            ITERATION(src_data[2] + start, accum2);
+            ITERATION(src_data[3] + start, accum3);
+
+            start += 16;
+            filter_values += 4;
+        }
+
+        int r = filter_length & 3;
+        if (r) {
+            // Note: the 8-byte load below may read past the last coefficient,
+            // so filter_values must be padded (see applySIMDPadding_SSE2).
+            __m128i coeff;
+            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
+            // Mask out extra filter taps.
+            coeff = _mm_and_si128(coeff, mask[r]);
+
+            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
+            /* c1 c1 c1 c1 c0 c0 c0 c0 */
+            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
+            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
+            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
+
+            __m128i src8, src16, mul_hi, mul_lo, t;
+
+            // The masked (zero) taps contribute nothing, but a full 16 bytes
+            // are still read from every row here.
+            ITERATION(src_data[0] + start, accum0);
+            ITERATION(src_data[1] + start, accum1);
+            ITERATION(src_data[2] + start, accum2);
+            ITERATION(src_data[3] + start, accum3);
+        }
+
+        // Shift right for fixed point, then saturate 32 -> 16 -> 8 bits.
+        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+        accum0 = _mm_packs_epi32(accum0, zero);
+        accum0 = _mm_packus_epi16(accum0, zero);
+        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+        accum1 = _mm_packs_epi32(accum1, zero);
+        accum1 = _mm_packus_epi16(accum1, zero);
+        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+        accum2 = _mm_packs_epi32(accum2, zero);
+        accum2 = _mm_packus_epi16(accum2, zero);
+        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+        accum3 = _mm_packs_epi32(accum3, zero);
+        accum3 = _mm_packus_epi16(accum3, zero);
+
+        // Store one 32-bit pixel per row.
+        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
+        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
+        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
+        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
+
+        out_row[0] += 4;
+        out_row[1] += 4;
+        out_row[2] += 4;
+        out_row[3] += 4;
+    }
+
+// Keep the helper macro from leaking into the rest of the translation unit.
+#undef ITERATION
+}
+
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |source_data_rows| array, with each row
+// being |pixel_width| wide.
+// If |has_alpha| is false the output alpha is forced to 0xFF; otherwise alpha is raised to at least max(r, g, b).
+// The output must have room for |pixel_width * 4| bytes.
+template<bool has_alpha>
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+ int filter_length,
+ unsigned char* const* source_data_rows,
+ int pixel_width,
+ unsigned char* out_row) {
+ int width = pixel_width & ~3;
+
+ __m128i zero = _mm_setzero_si128();
+ __m128i accum0, accum1, accum2, accum3, coeff16;
+ const __m128i* src;
+ // Output four pixels per iteration (16 bytes).
+ for (int out_x = 0; out_x < width; out_x += 4) {
+
+ // Accumulated result for each pixel. 32 bits per RGBA channel.
+ accum0 = _mm_setzero_si128();
+ accum1 = _mm_setzero_si128();
+ accum2 = _mm_setzero_si128();
+ accum3 = _mm_setzero_si128();
+
+ // Convolve with one filter coefficient per iteration.
+ for (int filter_y = 0; filter_y < filter_length; filter_y++) {
+
+ // Duplicate the filter coefficient 8 times.
+ // [16] cj cj cj cj cj cj cj cj
+ coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+
+ // Load four pixels (16 bytes) together.
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ src = reinterpret_cast<const __m128i*>(
+ &source_data_rows[filter_y][out_x << 2]);
+ __m128i src8 = _mm_loadu_si128(src);
+
+ // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
+ // multiply with current coefficient => accumulate the result.
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0 b0 g0 r0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum0 = _mm_add_epi32(accum0, t);
+ // [32] a1 b1 g1 r1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum1 = _mm_add_epi32(accum1, t);
+
+ // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
+ // multiply with current coefficient => accumulate the result.
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2 b2 g2 r2
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum2 = _mm_add_epi32(accum2, t);
+ // [32] a3 b3 g3 r3
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum3 = _mm_add_epi32(accum3, t);
+ }
+
+ // Shift right for fixed point implementation.
+ accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+ accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+ accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+ accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
+
+ // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packs_epi32(accum0, accum1);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ accum2 = _mm_packs_epi32(accum2, accum3);
+
+ // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packus_epi16(accum0, accum2);
+
+ if (has_alpha) {
+ // Compute the max(ri, gi, bi) for each pixel.
+ // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+ __m128i a = _mm_srli_epi32(accum0, 8);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
+ // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+ a = _mm_srli_epi32(accum0, 16);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ b = _mm_max_epu8(a, b); // Max of r and g and b.
+ // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+ b = _mm_slli_epi32(b, 24);
+
+ // Make sure the value of the alpha channel is never smaller than the
+ // maximum value of the color channels.
+ accum0 = _mm_max_epu8(b, accum0);
+ } else {
+ // Set value of alpha channels to 0xFF.
+ __m128i mask = _mm_set1_epi32(0xff000000);
+ accum0 = _mm_or_si128(accum0, mask);
+ }
+
+ // Store the convolution result (16 bytes) and advance the pixel pointers.
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
+ out_row += 16;
+ }
+
+ // When the width of the output is not divisible by 4, the remaining 1-3 pixels
+ // are stored one pixel (4 bytes) at a time. The fourth pixel is always absent.
+ if (pixel_width & 3) {
+ accum0 = _mm_setzero_si128();
+ accum1 = _mm_setzero_si128();
+ accum2 = _mm_setzero_si128();
+ for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
+ coeff16 = _mm_set1_epi16(filter_values[filter_y]);
+ // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ src = reinterpret_cast<const __m128i*>(
+ &source_data_rows[filter_y][width<<2]);
+ __m128i src8 = _mm_loadu_si128(src);  // NOTE(review): reads 16 bytes although fewer than 4 pixels remain; rows presumably have readable padding - confirm.
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ __m128i src16 = _mm_unpacklo_epi8(src8, zero);
+ __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a0 b0 g0 r0
+ __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum0 = _mm_add_epi32(accum0, t);
+ // [32] a1 b1 g1 r1
+ t = _mm_unpackhi_epi16(mul_lo, mul_hi);
+ accum1 = _mm_add_epi32(accum1, t);
+ // [16] a3 b3 g3 r3 a2 b2 g2 r2
+ src16 = _mm_unpackhi_epi8(src8, zero);
+ mul_hi = _mm_mulhi_epi16(src16, coeff16);
+ mul_lo = _mm_mullo_epi16(src16, coeff16);
+ // [32] a2 b2 g2 r2 (the 4th pixel's products are discarded)
+ t = _mm_unpacklo_epi16(mul_lo, mul_hi);
+ accum2 = _mm_add_epi32(accum2, t);
+ }
+
+ accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
+ accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
+ accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
+ // [16] a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packs_epi32(accum0, accum1);
+ // [16] 00 00 00 00 a2 b2 g2 r2
+ accum2 = _mm_packs_epi32(accum2, zero);
+ // [8] 00 00 00 00 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+ accum0 = _mm_packus_epi16(accum0, accum2);
+ if (has_alpha) {
+ // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
+ __m128i a = _mm_srli_epi32(accum0, 8);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
+ // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
+ a = _mm_srli_epi32(accum0, 16);
+ // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
+ b = _mm_max_epu8(a, b); // Max of r and g and b.
+ // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
+ b = _mm_slli_epi32(b, 24);
+ accum0 = _mm_max_epu8(b, accum0);
+ } else {
+ __m128i mask = _mm_set1_epi32(0xff000000);
+ accum0 = _mm_or_si128(accum0, mask);
+ }
+
+ for (int out_x = width; out_x < pixel_width; out_x++) {
+ *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
+ accum0 = _mm_srli_si128(accum0, 4);
+ out_row += 4;
+ }
+ }
+}
+
+// Runtime entry point for the SSE2 vertical convolution: picks the
+// compile-time specialization that matches |has_alpha| and forwards every
+// other argument unchanged.
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+                             int filter_length,
+                             unsigned char* const* source_data_rows,
+                             int pixel_width,
+                             unsigned char* out_row,
+                             bool has_alpha) {
+    if (!has_alpha) {
+        // Opaque source: use the variant that forces alpha to 0xFF.
+        convolveVertically_SSE2<false>(
+            filter_values, filter_length, source_data_rows, pixel_width, out_row);
+        return;
+    }
+
+    convolveVertically_SSE2<true>(
+        filter_values, filter_length, source_data_rows, pixel_width, out_row);
+}
+
+// Appends |paddingCount| dummy (zero) coefficients after the coefficients of
+// the last filter so that SIMD instructions which load 8 or 16 bytes at once
+// do not access invalid memory. We are not trying to align the coefficients
+// right now due to the opaqueness of the <vector> implementation.
+// This has to be done after all |AddFilter| calls.
+void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
+    // Eight coefficients; presumably 16 bytes, i.e. one full SSE2 load, if
+    // ConvolutionFixed is 16 bits wide - TODO confirm.
+    const int paddingCount = 8;
+    const SkConvolutionFilter1D::ConvolutionFixed zeroCoeff =
+        static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0);
+
+    for (int i = 0; i < paddingCount; ++i) {
+        filter->addFilterValue(zeroCoeff);
+    }
+}
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.h b/src/opts/SkBitmapFilter_opts_SSE2.h
index c511acc83a..588f4ef18b 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.h
+++ b/src/opts/SkBitmapFilter_opts_SSE2.h
@@ -11,10 +11,27 @@
#define SkBitmapFilter_opts_sse2_DEFINED
#include "SkBitmapProcState.h"
+#include "SkConvolver.h"
void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y,
SkPMColor *SK_RESTRICT colors, int count);
+
+void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
+ int filter_length,
+ unsigned char* const* source_data_rows,
+ int pixel_width,
+ unsigned char* out_row,
+ bool has_alpha);  // SSE2 vertical pass producing one output row.
+void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
+ const SkConvolutionFilter1D& filter,
+ unsigned char* out_row[4]);  // SSE2 horizontal pass over four rows at once.
+void convolveHorizontally_SSE2(const unsigned char* src_data,
+ const SkConvolutionFilter1D& filter,
+ unsigned char* out_row,
+ bool has_alpha);  // SSE2 horizontal pass over a single row; has_alpha is ignored.
+void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter);  // Appends zero coefficients so SIMD loads stay in bounds.
+
#endif
diff --git a/src/opts/SkBitmapProcState_opts_none.cpp b/src/opts/SkBitmapProcState_opts_none.cpp
index 3a186b5bfe..62af6d0f83 100644
--- a/src/opts/SkBitmapProcState_opts_none.cpp
+++ b/src/opts/SkBitmapProcState_opts_none.cpp
@@ -21,3 +21,6 @@
// Intentionally empty: this build just uses the default supplied function pointers.
void SkBitmapProcState::platformProcs() {}
+
+// Intentionally empty: this build just uses the default supplied function pointers.
+void SkBitmapProcState::platformScaleProc() {}  // NOTE(review): opts_check_SSE2.cpp defines platformConvolutionProcs(); confirm this name matches SkBitmapProcState.h.
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 37ce9036ca..0bb450356d 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -107,6 +107,16 @@ static bool cachedHasSSSE3() {
SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
+// Installs the SSE2 convolution routines into fConvolutionProcs when the CPU
+// supports SSE2; otherwise fConvolutionProcs is left untouched.
+void SkBitmapProcState::platformConvolutionProcs() {
+    if (!cachedHasSSE2()) {
+        return;
+    }
+
+    // Horizontal passes (single row and four rows at a time).
+    fConvolutionProcs->fConvolveHorizontally = &convolveHorizontally_SSE2;
+    fConvolutionProcs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
+    // Vertical pass.
+    fConvolutionProcs->fConvolveVertically = &convolveVertically_SSE2;
+    // Coefficient padding required by the SIMD loads.
+    fConvolutionProcs->fApplySIMDPadding = &applySIMDPadding_SSE2;
+    // Presumably the number of extra pixels a 16-byte (four pixel) horizontal
+    // load may touch past the last needed one - TODO confirm with the caller.
+    fConvolutionProcs->fExtraHorizontalReads = 3;
+}
+
void SkBitmapProcState::platformProcs() {
if (cachedHasSSSE3()) {
#if !defined(SK_BUILD_FOR_ANDROID)
@@ -151,9 +161,6 @@ void SkBitmapProcState::platformProcs() {
if (fShaderProc32 == highQualityFilter) {
fShaderProc32 = highQualityFilter_SSE2;
}
- if (fShaderProc32 == highQualityFilter_ScaleOnly) {
- fShaderProc32 = highQualityFilter_ScaleOnly_SSE2;
- }
}
}
}