diff options
-rw-r--r-- | gm/downsamplebitmap.cpp | 2 | ||||
-rw-r--r-- | gyp/core.gypi | 4 | ||||
-rw-r--r-- | include/core/SkBitmap.h | 14 | ||||
-rw-r--r-- | src/core/SkBitmapFilter.cpp | 227 | ||||
-rw-r--r-- | src/core/SkBitmapFilter.h | 56 | ||||
-rw-r--r-- | src/core/SkBitmapProcState.cpp | 57 | ||||
-rw-r--r-- | src/core/SkBitmapProcState.h | 16 | ||||
-rw-r--r-- | src/core/SkBitmapScaler.cpp | 315 | ||||
-rw-r--r-- | src/core/SkBitmapScaler.h | 106 | ||||
-rw-r--r-- | src/core/SkConvolver.cpp | 473 | ||||
-rw-r--r-- | src/core/SkConvolver.h | 203 | ||||
-rw-r--r-- | src/opts/SkBitmapFilter_opts_SSE2.cpp | 454 | ||||
-rw-r--r-- | src/opts/SkBitmapFilter_opts_SSE2.h | 17 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_none.cpp | 3 | ||||
-rw-r--r-- | src/opts/opts_check_SSE2.cpp | 13 |
15 files changed, 1692 insertions, 268 deletions
diff --git a/gm/downsamplebitmap.cpp b/gm/downsamplebitmap.cpp index a59e5b85df..e34effa07f 100644 --- a/gm/downsamplebitmap.cpp +++ b/gm/downsamplebitmap.cpp @@ -75,7 +75,7 @@ protected: curWidth = (int) (fBM.width() * curScale + 2); curX += curWidth; curScale *= 0.75f; - } while (curX < 4*fBM.width()); + } while (curWidth >= 2 && curX < 4*fBM.width()); } private: diff --git a/gyp/core.gypi b/gyp/core.gypi index bf5e245924..eac96f6f9f 100644 --- a/gyp/core.gypi +++ b/gyp/core.gypi @@ -32,6 +32,8 @@ '<(skia_src_path)/core/SkBitmapProcState_matrix.h', '<(skia_src_path)/core/SkBitmapProcState_matrixProcs.cpp', '<(skia_src_path)/core/SkBitmapProcState_sample.h', + '<(skia_src_path)/core/SkBitmapScaler.h', + '<(skia_src_path)/core/SkBitmapScaler.cpp', '<(skia_src_path)/core/SkBitmapShader16BilerpTemplate.h', '<(skia_src_path)/core/SkBitmapShaderTemplate.h', '<(skia_src_path)/core/SkBitmap_scroll.cpp', @@ -56,6 +58,8 @@ '<(skia_src_path)/core/SkComposeShader.cpp', '<(skia_src_path)/core/SkConfig8888.cpp', '<(skia_src_path)/core/SkConfig8888.h', + '<(skia_src_path)/core/SkConvolver.cpp', + '<(skia_src_path)/core/SkConvolver.h', '<(skia_src_path)/core/SkCordic.cpp', '<(skia_src_path)/core/SkCordic.h', '<(skia_src_path)/core/SkCoreBlitters.h', diff --git a/include/core/SkBitmap.h b/include/core/SkBitmap.h index d5277c6c80..6d368f5b49 100644 --- a/include/core/SkBitmap.h +++ b/include/core/SkBitmap.h @@ -702,19 +702,7 @@ private: int extractMipLevel(SkBitmap* dst, SkFixed sx, SkFixed sy); bool hasMipMap() const; void freeMipMap(); - - /** Make a scaled copy of this bitmap into the provided destination. - * The caller is responsible for having set the width and height of the - * provided destination bitmap, and also having allocated its pixel - * memory. - * - * This function is temporary and for testing purposes only; it will - * likely move once it has been properly plumbed into the bitmap - * shader infrastructure. 
- */ - - void scale(SkBitmap *dst) const; - + friend struct SkBitmapProcState; }; diff --git a/src/core/SkBitmapFilter.cpp b/src/core/SkBitmapFilter.cpp index 434ea9a536..060400944f 100644 --- a/src/core/SkBitmapFilter.cpp +++ b/src/core/SkBitmapFilter.cpp @@ -5,15 +5,23 @@ * found in the LICENSE file. */ +#include "SkErrorInternals.h" +#include "SkConvolver.h" #include "SkBitmapProcState.h" #include "SkBitmap.h" #include "SkColor.h" #include "SkColorPriv.h" +#include "SkConvolver.h" #include "SkUnPreMultiply.h" #include "SkShader.h" #include "SkRTConf.h" #include "SkMath.h" +// These are the per-scanline callbacks that are used when we must resort to +// resampling an image as it is blitted. Typically these are used only when +// the image is rotated or has some other complex transformation applied. +// Scaled images will usually be rescaled directly before rasterization. + void highQualityFilter(const SkBitmapProcState& s, int x, int y, SkPMColor* SK_RESTRICT colors, int count) { @@ -68,71 +76,15 @@ void highQualityFilter(const SkBitmapProcState& s, int x, int y, } } -void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y, - SkPMColor *SK_RESTRICT colors, int count) { - const int maxX = s.fBitmap->width() - 1; - const int maxY = s.fBitmap->height() - 1; - - SkPoint srcPt; - - s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f), - SkFloatToScalar(y + 0.5f), &srcPt); - srcPt.fY -= SK_ScalarHalf; - int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY); - int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()), maxY); - - while (count-- > 0) { - s.fInvProc(s.fInvMatrix, SkFloatToScalar(x + 0.5f), - SkFloatToScalar(y + 0.5f), &srcPt); - srcPt.fX -= SK_ScalarHalf; - srcPt.fY -= SK_ScalarHalf; - - SkScalar weight = 0; - SkScalar fr = 0, fg = 0, fb = 0, fa = 0; - - int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX); - int x1 = 
SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width()), maxX); - - for (int srcY = y0; srcY <= y1; srcY++) { - SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY)); - - for (int srcX = x0; srcX <= x1 ; srcX++) { - SkScalar xWeight = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX)); - - SkScalar combined_weight = SkScalarMul(xWeight, yWeight); - - SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY); - fr += combined_weight * SkGetPackedR32(c); - fg += combined_weight * SkGetPackedG32(c); - fb += combined_weight * SkGetPackedB32(c); - fa += combined_weight * SkGetPackedA32(c); - weight += combined_weight; - } - } - - fr = SkScalarDiv(fr, weight); - fg = SkScalarDiv(fg, weight); - fb = SkScalarDiv(fb, weight); - fa = SkScalarDiv(fa, weight); - - int a = SkClampMax(SkScalarRoundToInt(fa), 255); - int r = SkClampMax(SkScalarRoundToInt(fr), a); - int g = SkClampMax(SkScalarRoundToInt(fg), a); - int b = SkClampMax(SkScalarRoundToInt(fb), a); - - *colors++ = SkPackARGB32(a, r, g, b); - - x++; - } -} - -SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which bitmap filter to use [mitchell, sinc, gaussian, triangle, box]"); +SK_CONF_DECLARE(const char *, c_bitmapFilter, "bitmap.filter", "mitchell", "Which scanline bitmap filter to use [mitchell, lanczos, hamming, gaussian, triangle, box]"); -static SkBitmapFilter *allocateBitmapFilter() { +SkBitmapFilter *SkBitmapFilter::Allocate() { if (!strcmp(c_bitmapFilter, "mitchell")) { return SkNEW_ARGS(SkMitchellFilter,(1.f/3.f,1.f/3.f)); - } else if (!strcmp(c_bitmapFilter, "sinc")) { - return SkNEW_ARGS(SkSincFilter,(3)); + } else if (!strcmp(c_bitmapFilter, "lanczos")) { + return SkNEW(SkLanczosFilter); + } else if (!strcmp(c_bitmapFilter, "hamming")) { + return SkNEW(SkHammingFilter); } else if (!strcmp(c_bitmapFilter, "gaussian")) { return SkNEW_ARGS(SkGaussianFilter,(2)); } else if (!strcmp(c_bitmapFilter, "triangle")) { @@ -168,159 +120,12 @@ 
SkBitmapProcState::chooseBitmapFilterProc() { } if (fInvType & (SkMatrix::kAffine_Mask | SkMatrix::kScale_Mask)) { - fBitmapFilter = allocateBitmapFilter(); + fBitmapFilter = SkBitmapFilter::Allocate(); } - if (fInvType & SkMatrix::kAffine_Mask) { + if (fInvType & SkMatrix::kScale_Mask) { return highQualityFilter; - } else if (fInvType & SkMatrix::kScale_Mask) { - return highQualityFilter_ScaleOnly; } else { return NULL; } } - -static void divideByWeights(SkScalar *sums, SkScalar *weights, SkBitmap *dst) { - for (int y = 0 ; y < dst->height() ; y++) { - for (int x = 0 ; x < dst->width() ; x++) { - SkScalar fr = SkScalarDiv(sums[4*(y*dst->width() + x) + 0], weights[y*dst->width() + x]); - SkScalar fg = SkScalarDiv(sums[4*(y*dst->width() + x) + 1], weights[y*dst->width() + x]); - SkScalar fb = SkScalarDiv(sums[4*(y*dst->width() + x) + 2], weights[y*dst->width() + x]); - SkScalar fa = SkScalarDiv(sums[4*(y*dst->width() + x) + 3], weights[y*dst->width() + x]); - int a = SkClampMax(SkScalarRoundToInt(fa), 255); - int r = SkClampMax(SkScalarRoundToInt(fr), a); - int g = SkClampMax(SkScalarRoundToInt(fg), a); - int b = SkClampMax(SkScalarRoundToInt(fb), a); - - *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b); - } - } -} - -static void upScaleHorizTranspose(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) { - for (int y = 0 ; y < dst->height() ; y++) { - for (int x = 0 ; x < dst->width() ; x++) { - float sx = (y + 0.5f) / scale - 0.5f; - int x0 = SkClampMax(sk_float_ceil2int(sx-filter->width()), src->width()-1); - int x1 = SkClampMax(sk_float_floor2int(sx+filter->width()), src->width()-1); - - SkScalar totalWeight = 0; - SkScalar fr = 0, fg = 0, fb = 0, fa = 0; - - for (int srcX = x0 ; srcX <= x1 ; srcX++) { - SkScalar weight = filter->lookupScalar(sx - srcX); - SkPMColor c = *src->getAddr32(srcX, x); - fr += SkScalarMul(weight,SkGetPackedR32(c)); - fg += SkScalarMul(weight,SkGetPackedG32(c)); - fb += SkScalarMul(weight,SkGetPackedB32(c)); - fa += 
SkScalarMul(weight,SkGetPackedA32(c)); - totalWeight += weight; - } - fr = SkScalarDiv(fr,totalWeight); - fg = SkScalarDiv(fg,totalWeight); - fb = SkScalarDiv(fb,totalWeight); - fa = SkScalarDiv(fa,totalWeight); - - int a = SkClampMax(SkScalarRoundToInt(fa), 255); - int r = SkClampMax(SkScalarRoundToInt(fr), a); - int g = SkClampMax(SkScalarRoundToInt(fg), a); - int b = SkClampMax(SkScalarRoundToInt(fb), a); - - *dst->getAddr32(x,y) = SkPackARGB32(a, r, g, b); - } - } -} - -static void downScaleHoriz(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) { - SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() * 4); - SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height()); - - SkAutoTDeleteArray<SkScalar> ada1(sums); - SkAutoTDeleteArray<SkScalar> ada2(weights); - - memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4); - memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar)); - - for (int y = 0 ; y < src->height() ; y++) { - for (int x = 0 ; x < src->width() ; x++) { - // splat each source pixel into the destination image - float dx = (x + 0.5f) * scale - 0.5f; - int x0 = SkClampMax(sk_float_ceil2int(dx-filter->width()), dst->width()-1); - int x1 = SkClampMax(sk_float_floor2int(dx+filter->width()), dst->width()-1); - - SkPMColor c = *src->getAddr32(x,y); - - for (int dst_x = x0 ; dst_x <= x1 ; dst_x++) { - SkScalar weight = filter->lookup(dx - dst_x); - sums[4*(y*dst->width() + dst_x) + 0] += weight*SkGetPackedR32(c); - sums[4*(y*dst->width() + dst_x) + 1] += weight*SkGetPackedG32(c); - sums[4*(y*dst->width() + dst_x) + 2] += weight*SkGetPackedB32(c); - sums[4*(y*dst->width() + dst_x) + 3] += weight*SkGetPackedA32(c); - weights[y*dst->width() + dst_x] += weight; - } - } - } - - divideByWeights(sums, weights, dst); -} - -static void downScaleVert(const SkBitmap *src, SkBitmap *dst, float scale, SkBitmapFilter *filter) { - SkScalar *sums = SkNEW_ARRAY(SkScalar, dst->width() * dst->height() 
* 4); - SkScalar *weights = SkNEW_ARRAY(SkScalar, dst->width() * dst->height()); - - SkAutoTDeleteArray<SkScalar> ada1(sums); - SkAutoTDeleteArray<SkScalar> ada2(weights); - - memset(sums, 0, dst->width() * dst->height() * sizeof(SkScalar) * 4); - memset(weights, 0, dst->width() * dst->height() * sizeof(SkScalar)); - - for (int y = 0 ; y < src->height() ; y++) { - for (int x = 0 ; x < src->width() ; x++) { - // splat each source pixel into the destination image - float dy = (y + 0.5f) * scale - 0.5f; - int y0 = SkClampMax(sk_float_ceil2int(dy-filter->width()), dst->height()-1); - int y1 = SkClampMax(sk_float_ceil2int(dy+filter->width()), dst->height()-1); - - SkPMColor c = *src->getAddr32(x,y); - - for (int dst_y = y0 ; dst_y <= y1 ; dst_y++) { - SkScalar weight = filter->lookupScalar(dy - dst_y); - sums[4*(dst_y*dst->width() + x) + 0] += weight*SkGetPackedR32(c); - sums[4*(dst_y*dst->width() + x) + 1] += weight*SkGetPackedG32(c); - sums[4*(dst_y*dst->width() + x) + 2] += weight*SkGetPackedB32(c); - sums[4*(dst_y*dst->width() + x) + 3] += weight*SkGetPackedA32(c); - weights[dst_y*dst->width() + x] += weight; - } - } - } - - divideByWeights(sums, weights, dst); -} - -void SkBitmap::scale(SkBitmap *dst) const { - - SkBitmap horizTemp; - - horizTemp.setConfig(SkBitmap::kARGB_8888_Config, height(), dst->width()); - horizTemp.allocPixels(); - - SkBitmapFilter *filter = allocateBitmapFilter(); - - float horizScale = float(dst->width()) / width(); - - if (horizScale >= 1) { - upScaleHorizTranspose(this, &horizTemp, horizScale, filter); - } else if (horizScale < 1) { - downScaleHoriz(this, &horizTemp, horizScale, filter); - } - - float vertScale = float(dst->height()) / height(); - - if (vertScale >= 1) { - upScaleHorizTranspose(&horizTemp, dst, vertScale, filter); - } else if (vertScale < 1) { - downScaleVert(&horizTemp, dst, vertScale, filter); - } - - SkDELETE(filter); -} diff --git a/src/core/SkBitmapFilter.h b/src/core/SkBitmapFilter.h index 38c2448c69..6a9e3d7c01 
100644 --- a/src/core/SkBitmapFilter.h +++ b/src/core/SkBitmapFilter.h @@ -26,28 +26,30 @@ class SkBitmapFilter { fLookupMultiplier = this->invWidth() * (SKBITMAP_FILTER_TABLE_SIZE-1); } - SkFixed lookup( float x ) const { + SkFixed lookup(float x) const { if (!fPrecomputed) { precomputeTable(); } int filter_idx = int(sk_float_abs(x * fLookupMultiplier)); SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE); - return fFilterTable[ filter_idx ]; + return fFilterTable[filter_idx]; } - SkScalar lookupScalar( float x ) const { + SkScalar lookupScalar(float x) const { if (!fPrecomputed) { precomputeTable(); } int filter_idx = int(sk_float_abs(x * fLookupMultiplier)); SkASSERT(filter_idx < SKBITMAP_FILTER_TABLE_SIZE); - return fFilterTableScalar[ filter_idx ]; + return fFilterTableScalar[filter_idx]; } float width() const { return fWidth; } float invWidth() const { return fInvWidth; } virtual float evaluate(float x) const = 0; virtual ~SkBitmapFilter() {} + + static SkBitmapFilter* Allocate(); protected: float fWidth; float fInvWidth; @@ -126,29 +128,47 @@ class SkBoxFilter: public SkBitmapFilter { } virtual float evaluate(float x) const SK_OVERRIDE { - return 1; + return (x >= -fWidth && x < fWidth) ? 1.0f : 0.0f; } protected: }; +class SkHammingFilter: public SkBitmapFilter { +public: + SkHammingFilter(float width=1.f) + : SkBitmapFilter(width) { + } + virtual float evaluate(float x) const SK_OVERRIDE { + if (x <= -fWidth || x >= fWidth) { + return 0.0f; // Outside of the window. + } + if (x > -FLT_EPSILON && x < FLT_EPSILON) { + return 1.0f; // Special case the sinc discontinuity at the origin. 
+ } + const float xpi = x * static_cast<float>(M_PI); + + return ((sk_float_sin(xpi) / xpi) * // sinc(x) + (0.54f + 0.46f * sk_float_cos(xpi / fWidth))); // hamming(x) + } +}; -class SkSincFilter: public SkBitmapFilter { +class SkLanczosFilter: public SkBitmapFilter { public: - SkSincFilter(float t, float width=3.f) - : SkBitmapFilter(width), tau(t) { + SkLanczosFilter(float width=3.f) + : SkBitmapFilter(width) { } virtual float evaluate(float x) const SK_OVERRIDE { - x = sk_float_abs(x * fInvWidth); - if (x < 1e-5f) return 1.f; - if (x > 1.f) return 0.f; - x *= SK_ScalarPI; - float sinc = sk_float_sin(x) / x; - float lanczos = sk_float_sin(x * tau) / (x * tau); - return sinc * lanczos; - } - protected: - float tau; + if (x <= -fWidth || x >= fWidth) { + return 0.0f; // Outside of the window. + } + if (x > -FLT_EPSILON && x < FLT_EPSILON) { + return 1.0f; // Special case the discontinuity at the origin. + } + float xpi = x * static_cast<float>(M_PI); + return (sk_float_sin(xpi) / xpi) * // sinc(x) + sk_float_sin(xpi / fWidth) / (xpi / fWidth); // sinc(x/fWidth) + } }; diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index a8a9b03d9a..57af144034 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -11,6 +11,7 @@ #include "SkPaint.h" #include "SkShader.h" // for tilemodes #include "SkUtilsArm.h" +#include "SkBitmapScaler.h" #if !SK_ARM_NEON_IS_NONE // These are defined in src/opts/SkBitmapProcState_arm_neon.cpp @@ -99,23 +100,45 @@ void SkBitmapProcState::possiblyScaleImage() { if (fFilterQuality != kHQ_BitmapFilter) { return; } - - // STEP 1: UPSAMPLE? - - // Check to see if the transformation matrix is scaling up, and if - // the matrix is simple, and if we're doing high quality scaling. - // If so, do the bitmap scale here and remove the scaling component from the matrix. 
- - if (fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) && - (fInvMatrix.getScaleX() < 1 || fInvMatrix.getScaleY() < 1) && + + // see if our platform has any specialized convolution code. + + + // Set up a pointer to a local (instead of storing the structure in the + // proc state) to avoid introducing a header dependency; this makes + // recompiles a lot less painful. + + SkConvolutionProcs simd; + fConvolutionProcs = &simd; + + fConvolutionProcs->fExtraHorizontalReads = 0; + fConvolutionProcs->fConvolveVertically = NULL; + fConvolutionProcs->fConvolve4RowsHorizontally = NULL; + fConvolutionProcs->fConvolveHorizontally = NULL; + fConvolutionProcs->fApplySIMDPadding = NULL; + + this->platformConvolutionProcs(); + + // STEP 1: Highest quality direct scale? + + // Check to see if the transformation matrix is simple, and if we're + // doing high quality scaling. If so, do the bitmap scale here and + // remove the scaling component from the matrix. + + if (fFilterQuality == kHQ_BitmapFilter && + fInvMatrix.getType() <= (SkMatrix::kScale_Mask | SkMatrix::kTranslate_Mask) && fOrigBitmap.config() == SkBitmap::kARGB_8888_Config) { - + + int dest_width = SkScalarCeilToInt(fOrigBitmap.width() / fInvMatrix.getScaleX()); + int dest_height = SkScalarCeilToInt(fOrigBitmap.height() / fInvMatrix.getScaleY()); + // All the criteria are met; let's make a new bitmap. 
- fScaledBitmap.setConfig(SkBitmap::kARGB_8888_Config, - (int)(fOrigBitmap.width() / fInvMatrix.getScaleX()), - (int)(fOrigBitmap.height() / fInvMatrix.getScaleY())); - fScaledBitmap.allocPixels(); - fOrigBitmap.scale(&fScaledBitmap); + + fScaledBitmap = SkBitmapScaler::Resize( fOrigBitmap, SkBitmapScaler::RESIZE_BEST, + dest_width, dest_height, fConvolutionProcs ); + + fScaledBitmap.lockPixels(); + fBitmap = &fScaledBitmap; // set the inv matrix type to translate-only; @@ -130,9 +153,9 @@ void SkBitmapProcState::possiblyScaleImage() { return; } - if (!fOrigBitmap.hasMipMap()) { + if (!fOrigBitmap.hasMipMap() && fFilterQuality != kNone_BitmapFilter) { - // STEP 2: DOWNSAMPLE + // STEP 2: MIPMAP DOWNSAMPLE? // Check to see if the transformation matrix is scaling *down*. // If so, automatically build mipmaps. diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h index a644dd1e02..3c8e346807 100644 --- a/src/core/SkBitmapProcState.h +++ b/src/core/SkBitmapProcState.h @@ -31,6 +31,7 @@ #endif class SkPaint; +class SkConvolutionProcs; struct SkBitmapProcState { @@ -59,7 +60,7 @@ struct SkBitmapProcState { const uint32_t[], int count, uint16_t colors[]); - + typedef U16CPU (*FixedTileProc)(SkFixed); // returns 0..0xFFFF typedef U16CPU (*FixedTileLowBitsProc)(SkFixed, int); // returns 0..0xF typedef U16CPU (*IntTileProc)(int value, int count); // returns 0..count-1 @@ -78,6 +79,8 @@ struct SkBitmapProcState { IntTileProc fIntTileProcY; // chooseProcs SkFixed fFilterOneX; SkFixed fFilterOneY; + + SkConvolutionProcs* fConvolutionProcs; // possiblyScaleImage SkPMColor fPaintPMColor; // chooseProcs - A8 config SkFixed fInvSx; // chooseProcs @@ -113,7 +116,12 @@ struct SkBitmapProcState { implementation can do nothing (see SkBitmapProcState_opts_none.cpp) */ void platformProcs(); - + + /** Platforms can also optionally overwrite the convolution functions + if we have SIMD versions of them. 
+ */ + + void platformConvolutionProcs(); /** Given the byte size of the index buffer to be passed to the matrix proc, return the maximum number of resulting pixels that can be computed @@ -160,7 +168,7 @@ private: void possiblyScaleImage(); - SkBitmapFilter *fBitmapFilter; + SkBitmapFilter* fBitmapFilter; ShaderProc32 chooseBitmapFilterProc(); @@ -218,8 +226,6 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s, void S32_D16_filter_DX(const SkBitmapProcState& s, const uint32_t* xy, int count, uint16_t* colors); -void highQualityFilter_ScaleOnly(const SkBitmapProcState &s, int x, int y, - SkPMColor *SK_RESTRICT colors, int count); void highQualityFilter(const SkBitmapProcState &s, int x, int y, SkPMColor *SK_RESTRICT colors, int count); diff --git a/src/core/SkBitmapScaler.cpp b/src/core/SkBitmapScaler.cpp new file mode 100644 index 0000000000..7e840d2fdb --- /dev/null +++ b/src/core/SkBitmapScaler.cpp @@ -0,0 +1,315 @@ +#include "SkBitmapScaler.h" +#include "SkBitmapFilter.h" +#include "SkRect.h" +#include "SkTArray.h" +#include "SkErrorInternals.h" +#include "SkConvolver.h" + +// SkResizeFilter ---------------------------------------------------------------- + +// Encapsulates computation and storage of the filters required for one complete +// resize operation. +class SkResizeFilter { +public: + SkResizeFilter(SkBitmapScaler::ResizeMethod method, + int srcFullWidth, int srcFullHeight, + int destWidth, int destHeight, + const SkIRect& destSubset, + SkConvolutionProcs* convolveProcs); + ~SkResizeFilter() { + SkDELETE( fBitmapFilter ); + } + + // Returns the filled filter values. + const SkConvolutionFilter1D& xFilter() { return fXFilter; } + const SkConvolutionFilter1D& yFilter() { return fYFilter; } + +private: + + SkBitmapFilter* fBitmapFilter; + + // Computes one set of filters either horizontally or vertically. 
The caller + // will specify the "min" and "max" rather than the top/bottom and + // left/right so that the same code can be re-used in each dimension. + // + // |srcDependLo| and |srcDependSize| gives the range for the source + // depend rectangle (horizontally or vertically at the caller's discretion + // -- see above for what this means). + // + // Likewise, the range of destination values to compute and the scale factor + // for the transform is also specified. + + void computeFilters(int srcSize, + int destSubsetLo, int destSubsetSize, + float scale, + SkConvolutionFilter1D* output, + SkConvolutionProcs* convolveProcs); + + // Subset of scaled destination bitmap to compute. + SkIRect fOutBounds; + + SkConvolutionFilter1D fXFilter; + SkConvolutionFilter1D fYFilter; +}; + +SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method, + int srcFullWidth, int srcFullHeight, + int destWidth, int destHeight, + const SkIRect& destSubset, + SkConvolutionProcs* convolveProcs) + : fOutBounds(destSubset) { + + // method will only ever refer to an "algorithm method". 
+ SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) && + (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD)); + + switch(method) { + case SkBitmapScaler::RESIZE_BOX: + fBitmapFilter = SkNEW(SkBoxFilter); + break; + case SkBitmapScaler::RESIZE_TRIANGLE: + fBitmapFilter = SkNEW(SkTriangleFilter); + break; + case SkBitmapScaler::RESIZE_MITCHELL: + fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f)); + break; + case SkBitmapScaler::RESIZE_HAMMING: + fBitmapFilter = SkNEW(SkHammingFilter); + break; + case SkBitmapScaler::RESIZE_LANCZOS3: + fBitmapFilter = SkNEW(SkLanczosFilter); + break; + default: + // NOTREACHED: + fBitmapFilter = SkNEW_ARGS(SkMitchellFilter, (1.f/3.f, 1.f/3.f)); + break; + } + + + float scaleX = static_cast<float>(destWidth) / + static_cast<float>(srcFullWidth); + float scaleY = static_cast<float>(destHeight) / + static_cast<float>(srcFullHeight); + + this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(), + scaleX, &fXFilter, convolveProcs); + this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(), + scaleY, &fYFilter, convolveProcs); +} + +// TODO(egouriou): Take advantage of periods in the convolution. +// Practical resizing filters are periodic outside of the border area. +// For Lanczos, a scaling by a (reduced) factor of p/q (q pixels in the +// source become p pixels in the destination) will have a period of p. +// A nice consequence is a period of 1 when downscaling by an integral +// factor. Downscaling from typical display resolutions is also bound +// to produce interesting periods as those are chosen to have multiple +// small factors. +// Small periods reduce computational load and improve cache usage if +// the coefficients can be shared. For periods of 1 we can consider +// loading the factors only once outside the borders. 
+void SkResizeFilter::computeFilters(int srcSize, + int destSubsetLo, int destSubsetSize, + float scale, + SkConvolutionFilter1D* output, + SkConvolutionProcs* convolveProcs) { + int destSubsetHi = destSubsetLo + destSubsetSize; // [lo, hi) + + // When we're doing a magnification, the scale will be larger than one. This + // means the destination pixels are much smaller than the source pixels, and + // that the range covered by the filter won't necessarily cover any source + // pixel boundaries. Therefore, we use these clamped values (max of 1) for + // some computations. + float clampedScale = SkTMin(1.0f, scale); + + // This is how many source pixels from the center we need to count + // to support the filtering function. + float srcSupport = fBitmapFilter->width() / clampedScale; + + // Speed up the divisions below by turning them into multiplies. + float invScale = 1.0f / scale; + + SkTArray<float> filterValues(64); + SkTArray<short> fixedFilterValues(64); + + // Loop over all pixels in the output range. We will generate one set of + // filter values for each one. Those values will tell us how to blend the + // source pixels to compute the destination pixel. + for (int destSubsetI = destSubsetLo; destSubsetI < destSubsetHi; + destSubsetI++) { + // Reset the arrays. We don't declare them inside so they can re-use the + // same malloc-ed buffer. + filterValues.reset(); + fixedFilterValues.reset(); + + // This is the pixel in the source directly under the pixel in the dest. + // Note that we base computations on the "center" of the pixels. To see + // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x + // downscale should "cover" the pixels around the pixel with *its center* + // at coordinates (2.5, 2.5) in the source, not those around (0, 0). + // Hence we need to scale coordinates (0.5, 0.5), not (0, 0). 
+ float srcPixel = (static_cast<float>(destSubsetI) + 0.5f) * invScale; + + // Compute the (inclusive) range of source pixels the filter covers. + int srcBegin = SkTMax(0, SkScalarFloorToInt(srcPixel - srcSupport)); + int srcEnd = SkTMin(srcSize - 1, SkScalarCeilToInt(srcPixel + srcSupport)); + + // Compute the unnormalized filter value at each location of the source + // it covers. + float filterSum = 0.0f; // Sum of the filter values for normalizing. + for (int curFilterPixel = srcBegin; curFilterPixel <= srcEnd; + curFilterPixel++) { + // Distance from the center of the filter, this is the filter coordinate + // in source space. We also need to consider the center of the pixel + // when comparing distance against 'srcPixel'. In the 5x downscale + // example used above the distance from the center of the filter to + // the pixel with coordinates (2, 2) should be 0, because its center + // is at (2.5, 2.5). + float srcFilterDist = + ((static_cast<float>(curFilterPixel) + 0.5f) - srcPixel); + + // Since the filter really exists in dest space, map it there. + float destFilterDist = srcFilterDist * clampedScale; + + // Compute the filter value at that location. + float filterValue = fBitmapFilter->evaluate(destFilterDist); + filterValues.push_back(filterValue); + + filterSum += filterValue; + } + SkASSERT(!filterValues.empty()); + + // The filter must be normalized so that we don't affect the brightness of + // the image. Convert to normalized fixed point. + short fixedSum = 0; + for (int i = 0; i < filterValues.count(); i++) { + short curFixed = output->FloatToFixed(filterValues[i] / filterSum); + fixedSum += curFixed; + fixedFilterValues.push_back(curFixed); + } + + // The conversion to fixed point will leave some rounding errors, which + // we add back in to avoid affecting the brightness of the image. 
We + // arbitrarily add this to the center of the filter array (this won't always + // be the center of the filter function since it could get clipped on the + // edges, but it doesn't matter enough to worry about that case). + short leftovers = output->FloatToFixed(1.0f) - fixedSum; + fixedFilterValues[fixedFilterValues.count() / 2] += leftovers; + + // Now it's ready to go. + output->AddFilter(srcBegin, &fixedFilterValues[0], + static_cast<int>(fixedFilterValues.count())); + } + + if (convolveProcs->fApplySIMDPadding) { + convolveProcs->fApplySIMDPadding( output ); + } +} + +static SkBitmapScaler::ResizeMethod ResizeMethodToAlgorithmMethod( + SkBitmapScaler::ResizeMethod method) { + // Convert any "Quality Method" into an "Algorithm Method" + if (method >= SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD && + method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD) { + return method; + } + // The call to SkBitmapScalerGtv::Resize() above took care of + // GPU-acceleration in the cases where it is possible. So now we just + // pick the appropriate software method for each resize quality. + switch (method) { + // Users of RESIZE_GOOD are willing to trade a lot of quality to + // get speed, allowing the use of linear resampling to get hardware + // acceleration (SRB). Hence any of our "good" software filters + // will be acceptable, so we use a triangle. + case SkBitmapScaler::RESIZE_GOOD: + return SkBitmapScaler::RESIZE_TRIANGLE; + // Users of RESIZE_BETTER are willing to trade some quality in order + // to improve performance, but are guaranteed not to devolve to a linear + // resampling. In visual tests we see that Hamming-1 is not as good as + // Lanczos-2, however it is about 40% faster and Lanczos-2 itself is + // about 30% faster than Lanczos-3. The use of Hamming-1 has been deemed + // an acceptable trade-off between quality and speed. 
+ case SkBitmapScaler::RESIZE_BETTER: + return SkBitmapScaler::RESIZE_HAMMING; + default: + return SkBitmapScaler::RESIZE_MITCHELL; + } +} + +// static +SkBitmap SkBitmapScaler::Resize(const SkBitmap& source, + ResizeMethod method, + int destWidth, int destHeight, + const SkIRect& destSubset, + SkConvolutionProcs* convolveProcs, + SkBitmap::Allocator* allocator) { + // Ensure that the ResizeMethod enumeration is sound. + SkASSERT(((RESIZE_FIRST_QUALITY_METHOD <= method) && + (method <= RESIZE_LAST_QUALITY_METHOD)) || + ((RESIZE_FIRST_ALGORITHM_METHOD <= method) && + (method <= RESIZE_LAST_ALGORITHM_METHOD))); + + SkIRect dest = { 0, 0, destWidth, destHeight }; + if (!dest.contains(destSubset)) { + SkErrorInternals::SetError( kInvalidArgument_SkError, + "Sorry, you passed me a bitmap resize " + " method I have never heard of: %d", + method ); + } + + // If the size of source or destination is 0, i.e. 0x0, 0xN or Nx0, just + // return empty. + if (source.width() < 1 || source.height() < 1 || + destWidth < 1 || destHeight < 1) { + return SkBitmap(); + } + + method = ResizeMethodToAlgorithmMethod(method); + + // Check that we deal with an "algorithm methods" from this point onward. + SkASSERT((SkBitmapScaler::RESIZE_FIRST_ALGORITHM_METHOD <= method) && + (method <= SkBitmapScaler::RESIZE_LAST_ALGORITHM_METHOD)); + + SkAutoLockPixels locker(source); + if (!source.readyToDraw() || source.config() != SkBitmap::kARGB_8888_Config) + return SkBitmap(); + + SkResizeFilter filter(method, source.width(), source.height(), + destWidth, destHeight, destSubset, convolveProcs); + + // Get a source bitmap encompassing this touched area. We construct the + // offsets and row strides such that it looks like a new bitmap, while + // referring to the old data. + const unsigned char* sourceSubset = + reinterpret_cast<const unsigned char*>(source.getPixels()); + + // Convolve into the result. 
+ SkBitmap result; + result.setConfig(SkBitmap::kARGB_8888_Config, + destSubset.width(), destSubset.height()); + result.allocPixels(allocator, NULL); + if (!result.readyToDraw()) + return SkBitmap(); + + BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()), + !source.isOpaque(), filter.xFilter(), filter.yFilter(), + static_cast<int>(result.rowBytes()), + static_cast<unsigned char*>(result.getPixels()), + convolveProcs, true); + + // Preserve the "opaque" flag for use as an optimization later. + result.setIsOpaque(source.isOpaque()); + + return result; +} + +// static +SkBitmap SkBitmapScaler::Resize(const SkBitmap& source, + ResizeMethod method, + int destWidth, int destHeight, + SkConvolutionProcs* convolveProcs, + SkBitmap::Allocator* allocator) { + SkIRect destSubset = { 0, 0, destWidth, destHeight }; + return Resize(source, method, destWidth, destHeight, destSubset, + convolveProcs, allocator); +} diff --git a/src/core/SkBitmapScaler.h b/src/core/SkBitmapScaler.h new file mode 100644 index 0000000000..5682cc578d --- /dev/null +++ b/src/core/SkBitmapScaler.h @@ -0,0 +1,106 @@ +/* + * Copyright 2013 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkBitmapScaler_DEFINED +#define SkBitmapScaler_DEFINED + +#include "SkBitmap.h" +#include "SkConvolver.h" + +/** \class SkBitmapScaler + + Provides the interface for high quality image resampling. + */ + +class SK_API SkBitmapScaler { +public: + enum ResizeMethod { + // Quality Methods + // + // Those enumeration values express a desired quality/speed tradeoff. + // They are translated into an algorithm-specific method that depends + // on the capabilities (CPU, GPU) of the underlying platform. + // It is possible for all three methods to be mapped to the same + // algorithm on a given platform. + + // Good quality resizing. Fastest resizing with acceptable visual quality. 
/** \class SkBitmapScaler

    Provides the interface for high quality image resampling.
 */

class SK_API SkBitmapScaler {
public:
    enum ResizeMethod {
        // Quality Methods
        //
        // These enumeration values express a desired quality/speed tradeoff.
        // They are translated into an algorithm-specific method that depends
        // on the capabilities (CPU, GPU) of the underlying platform.
        // It is possible for all three methods to be mapped to the same
        // algorithm on a given platform.

        // Good quality resizing. Fastest resizing with acceptable visual quality.
        // This is typically intended for use during interactive layouts
        // where slower platforms may want to trade image quality for large
        // increase in resizing performance.
        //
        // For example the resizing implementation may devolve to linear
        // filtering if this enables GPU acceleration to be used.
        //
        // Note that the underlying resizing method may be determined
        // on the fly based on the parameters for a given resize call.
        // For example an implementation using a GPU-based linear filter
        // in the common case may still use a higher-quality software-based
        // filter in cases where using the GPU would actually be slower - due
        // to too much latency - or impossible - due to image format or size
        // constraints.
        RESIZE_GOOD,

        // Medium quality resizing. Close to high quality resizing (better
        // than linear interpolation) with potentially some quality being
        // traded-off for additional speed compared to RESIZE_BEST.
        //
        // This is intended, for example, for generation of large thumbnails
        // (hundreds of pixels in each dimension) from large sources, where
        // a linear filter would produce too many artifacts but where
        // RESIZE_BEST might be too costly time-wise.
        RESIZE_BETTER,

        // High quality resizing. The algorithm is picked to favor image quality.
        RESIZE_BEST,

        //
        // Algorithm-specific enumerations
        //

        // Box filter. This is a weighted average of all of the pixels touching
        // the destination pixel. For enlargement, this is nearest neighbor.
        //
        // You probably don't want this, it is here for testing since it is easy to
        // compute. Use RESIZE_LANCZOS3 instead.
        RESIZE_BOX,
        RESIZE_TRIANGLE,
        RESIZE_LANCZOS3,
        RESIZE_HAMMING,
        RESIZE_MITCHELL,

        // Enum aliases for the first and last methods of each group (quality
        // methods and algorithm methods). They are used for range checks, so
        // the enumerators of a group must stay contiguous.
        RESIZE_FIRST_QUALITY_METHOD = RESIZE_GOOD,
        RESIZE_LAST_QUALITY_METHOD = RESIZE_BEST,
        RESIZE_FIRST_ALGORITHM_METHOD = RESIZE_BOX,
        RESIZE_LAST_ALGORITHM_METHOD = RESIZE_MITCHELL,
    };

    // Resizes the given source bitmap using the specified resize method, so that
    // the entire image is (dest_width x dest_height) big. |dest_subset| is the
    // rectangle in this destination image that should actually be returned.
    //
    // The output image will be (dest_subset.width(), dest_subset.height()). This
    // will save work if you do not need the entire bitmap.
    //
    // |dest_subset| must lie entirely within the destination image, i.e.
    // within (0, 0, dest_width, dest_height).
    static SkBitmap Resize(const SkBitmap& source,
                           ResizeMethod method,
                           int dest_width, int dest_height,
                           const SkIRect& dest_subset,
                           SkConvolutionProcs *convolveProcs = NULL,
                           SkBitmap::Allocator* allocator = NULL);

    // Alternate version for resizing and returning the entire bitmap rather than
    // a subset.
    static SkBitmap Resize(const SkBitmap& source,
                           ResizeMethod method,
                           int dest_width, int dest_height,
                           SkConvolutionProcs *convolveProcs = NULL,
                           SkBitmap::Allocator* allocator = NULL);
};
+ } + if (a < 0) { + return 0; + } + return 255; + } + + // Takes the value produced by accumulating element-wise product of image with + // a kernel and brings it back into range. + // All of the filter scaling factors are in fixed point with kShiftBits bits of + // fractional part. + inline unsigned char BringBackTo8(int a, bool takeAbsolute) { + a >>= SkConvolutionFilter1D::kShiftBits; + if (takeAbsolute) { + a = abs(a); + } + return ClampTo8(a); + } + + // Stores a list of rows in a circular buffer. The usage is you write into it + // by calling AdvanceRow. It will keep track of which row in the buffer it + // should use next, and the total number of rows added. + class CircularRowBuffer { + public: + // The number of pixels in each row is given in |sourceRowPixelWidth|. + // The maximum number of rows needed in the buffer is |maxYFilterSize| + // (we only need to store enough rows for the biggest filter). + // + // We use the |firstInputRow| to compute the coordinates of all of the + // following rows returned by Advance(). + CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize, + int firstInputRow) + : fRowByteWidth(destRowPixelWidth * 4), + fNumRows(maxYFilterSize), + fNextRow(0), + fNextRowCoordinate(firstInputRow) { + fBuffer.reset(fRowByteWidth * maxYFilterSize); + fRowAddresses.reset(fNumRows); + } + + // Moves to the next row in the buffer, returning a pointer to the beginning + // of it. + unsigned char* advanceRow() { + unsigned char* row = &fBuffer[fNextRow * fRowByteWidth]; + fNextRowCoordinate++; + + // Set the pointer to the next row to use, wrapping around if necessary. + fNextRow++; + if (fNextRow == fNumRows) { + fNextRow = 0; + } + return row; + } + + // Returns a pointer to an "unrolled" array of rows. These rows will start + // at the y coordinate placed into |*firstRowIndex| and will continue in + // order for the maximum number of rows in this circular buffer. + // + // The |firstRowIndex_| may be negative. 
This means the circular buffer + // starts before the top of the image (it hasn't been filled yet). + unsigned char* const* GetRowAddresses(int* firstRowIndex) { + // Example for a 4-element circular buffer holding coords 6-9. + // Row 0 Coord 8 + // Row 1 Coord 9 + // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10. + // Row 3 Coord 7 + // + // The "next" row is also the first (lowest) coordinate. This computation + // may yield a negative value, but that's OK, the math will work out + // since the user of this buffer will compute the offset relative + // to the firstRowIndex and the negative rows will never be used. + *firstRowIndex = fNextRowCoordinate - fNumRows; + + int curRow = fNextRow; + for (int i = 0; i < fNumRows; i++) { + fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth]; + + // Advance to the next row, wrapping if necessary. + curRow++; + if (curRow == fNumRows) { + curRow = 0; + } + } + return &fRowAddresses[0]; + } + + private: + // The buffer storing the rows. They are packed, each one fRowByteWidth. + SkTArray<unsigned char> fBuffer; + + // Number of bytes per row in the |buffer|. + int fRowByteWidth; + + // The number of rows available in the buffer. + int fNumRows; + + // The next row index we should write into. This wraps around as the + // circular buffer is used. + int fNextRow; + + // The y coordinate of the |fNextRow|. This is incremented each time a + // new row is appended and does not wrap. + int fNextRowCoordinate; + + // Buffer used by GetRowAddresses(). + SkTArray<unsigned char*> fRowAddresses; + }; + +// Convolves horizontally along a single row. The row data is given in +// |srcData| and continues for the numValues() of the filter. +template<bool hasAlpha> + void ConvolveHorizontally(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow) { + // Loop over each pixel on this row in the output image. 
+ int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + // Apply the filter to the row to get the destination pixel in |accum|. + int accum[4] = {0}; + for (int filterX = 0; filterX < filterLength; filterX++) { + SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; + accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; + accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; + accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; + if (hasAlpha) { + accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; + } + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of fractional part. + accum[0] >>= SkConvolutionFilter1D::kShiftBits; + accum[1] >>= SkConvolutionFilter1D::kShiftBits; + accum[2] >>= SkConvolutionFilter1D::kShiftBits; + if (hasAlpha) { + accum[3] >>= SkConvolutionFilter1D::kShiftBits; + } + + // Store the new pixel. + outRow[outX * 4 + 0] = ClampTo8(accum[0]); + outRow[outX * 4 + 1] = ClampTo8(accum[1]); + outRow[outX * 4 + 2] = ClampTo8(accum[2]); + if (hasAlpha) { + outRow[outX * 4 + 3] = ClampTo8(accum[3]); + } + } + } + +// Does vertical convolution to produce one output row. The filter values and +// length are given in the first two parameters. These are applied to each +// of the rows pointed to in the |sourceDataRows| array, with each row +// being |pixelWidth| wide. +// +// The output must have room for |pixelWidth * 4| bytes. 
+template<bool hasAlpha> + void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow) { + // We go through each column in the output and do a vertical convolution, + // generating one output pixel each time. + for (int outX = 0; outX < pixelWidth; outX++) { + // Compute the number of bytes over in each row that the current column + // we're convolving starts at. The pixel will cover the next 4 bytes. + int byteOffset = outX * 4; + + // Apply the filter to one column of pixels. + int accum[4] = {0}; + for (int filterY = 0; filterY < filterLength; filterY++) { + SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; + accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; + accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; + accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; + if (hasAlpha) { + accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; + } + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of precision. + accum[0] >>= SkConvolutionFilter1D::kShiftBits; + accum[1] >>= SkConvolutionFilter1D::kShiftBits; + accum[2] >>= SkConvolutionFilter1D::kShiftBits; + if (hasAlpha) { + accum[3] >>= SkConvolutionFilter1D::kShiftBits; + } + + // Store the new pixel. + outRow[byteOffset + 0] = ClampTo8(accum[0]); + outRow[byteOffset + 1] = ClampTo8(accum[1]); + outRow[byteOffset + 2] = ClampTo8(accum[2]); + if (hasAlpha) { + unsigned char alpha = ClampTo8(accum[3]); + + // Make sure the alpha channel doesn't come out smaller than any of the + // color channels. We use premultipled alpha channels, so this should + // never happen, but rounding errors will cause this from time to time. 
+ // These "impossible" colors will cause overflows (and hence random pixel + // values) when the resulting bitmap is drawn to the screen. + // + // We only need to do this when generating the final output row (here). + int maxColorChannel = SkTMax(outRow[byteOffset + 0], + SkTMax(outRow[byteOffset + 1], + outRow[byteOffset + 2])); + if (alpha < maxColorChannel) { + outRow[byteOffset + 3] = maxColorChannel; + } else { + outRow[byteOffset + 3] = alpha; + } + } else { + // No alpha channel, the image is opaque. + outRow[byteOffset + 3] = 0xff; + } + } + } + + void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow, + bool sourceHasAlpha) { + if (sourceHasAlpha) { + ConvolveVertically<true>(filterValues, filterLength, + sourceDataRows, pixelWidth, + outRow); + } else { + ConvolveVertically<false>(filterValues, filterLength, + sourceDataRows, pixelWidth, + outRow); + } + } + +} // namespace + +// SkConvolutionFilter1D --------------------------------------------------------- + +SkConvolutionFilter1D::SkConvolutionFilter1D() +: fMaxFilter(0) { +} + +SkConvolutionFilter1D::~SkConvolutionFilter1D() { +} + +void SkConvolutionFilter1D::AddFilter(int filterOffset, + const float* filterValues, + int filterLength) { + SkASSERT(filterLength > 0); + + SkTArray<ConvolutionFixed> fixedValues; + fixedValues.reset(filterLength); + + for (int i = 0; i < filterLength; ++i) { + fixedValues.push_back(FloatToFixed(filterValues[i])); + } + + AddFilter(filterOffset, &fixedValues[0], filterLength); +} + +void SkConvolutionFilter1D::AddFilter(int filterOffset, + const ConvolutionFixed* filterValues, + int filterLength) { + // It is common for leading/trailing filter values to be zeros. In such + // cases it is beneficial to only store the central factors. 
+ // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on + // a 1080p image this optimization gives a ~10% speed improvement. + int filterSize = filterLength; + int firstNonZero = 0; + while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) { + firstNonZero++; + } + + if (firstNonZero < filterLength) { + // Here we have at least one non-zero factor. + int lastNonZero = filterLength - 1; + while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) { + lastNonZero--; + } + + filterOffset += firstNonZero; + filterLength = lastNonZero + 1 - firstNonZero; + SkASSERT(filterLength > 0); + + for (int i = firstNonZero; i <= lastNonZero; i++) { + fFilterValues.push_back(filterValues[i]); + } + } else { + // Here all the factors were zeroes. + filterLength = 0; + } + + FilterInstance instance; + + // We pushed filterLength elements onto fFilterValues + instance.fDataLocation = (static_cast<int>(fFilterValues.count()) - + filterLength); + instance.fOffset = filterOffset; + instance.fTrimmedLength = filterLength; + instance.fLength = filterSize; + fFilters.push_back(instance); + + fMaxFilter = SkTMax(fMaxFilter, filterLength); +} + +const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter( + int* specifiedFilterlength, + int* filterOffset, + int* filterLength) const { + const FilterInstance& filter = fFilters[0]; + *filterOffset = filter.fOffset; + *filterLength = filter.fTrimmedLength; + *specifiedFilterlength = filter.fLength; + if (filter.fTrimmedLength == 0) { + return NULL; + } + + return &fFilterValues[filter.fDataLocation]; +} + +void BGRAConvolve2D(const unsigned char* sourceData, + int sourceByteRowStride, + bool sourceHasAlpha, + const SkConvolutionFilter1D& filterX, + const SkConvolutionFilter1D& filterY, + int outputByteRowStride, + unsigned char* output, + SkConvolutionProcs* convolveProcs, + bool useSimdIfPossible) { + + int maxYFilterSize = filterY.maxFilter(); + + // The next row in the input that we 
will generate a horizontally + // convolved row for. If the filter doesn't start at the beginning of the + // image (this is the case when we are only resizing a subset), then we + // don't want to generate any output rows before that. Compute the starting + // row for convolution as the first pixel for the first vertical filter. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filterY.FilterForValue(0, &filterOffset, &filterLength); + int nextXRow = filterOffset; + + // We loop over each row in the input doing a horizontal convolution. This + // will result in a horizontally convolved image. We write the results into + // a circular buffer of convolved rows and do vertical convolution as rows + // are available. This prevents us from having to store the entire + // intermediate image and helps cache coherency. + // We will need four extra rows to allow horizontal convolution could be done + // simultaneously. We also pad each row in row buffer to be aligned-up to + // 16 bytes. + // TODO(jiesun): We do not use aligned load from row buffer in vertical + // convolution pass yet. Somehow Windows does not like it. + int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; + int rowBufferHeight = maxYFilterSize + + (convolveProcs->fConvolve4RowsHorizontally ? 4 : 0); + CircularRowBuffer rowBuffer(rowBufferWidth, + rowBufferHeight, + filterOffset); + + // Loop over every possible output row, processing just enough horizontal + // convolutions to run each subsequent vertical convolution. + SkASSERT(outputByteRowStride >= filterX.numValues() * 4); + int numOutputRows = filterY.numValues(); + + // We need to check which is the last line to convolve before we advance 4 + // lines in one iteration. + int lastFilterOffset, lastFilterLength; + + // SSE2 can access up to 3 extra pixels past the end of the + // buffer. At the bottom of the image, we have to be careful + // not to access data past the end of the buffer. 
Normally + // we fall back to the C++ implementation for the last row. + // If the last row is less than 3 pixels wide, we may have to fall + // back to the C++ version for more rows. Compute how many + // rows we need to avoid the SSE implementation for here. + filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset, + &lastFilterLength); + int avoidSimdRows = 1 + convolveProcs->fExtraHorizontalReads / + (lastFilterOffset + lastFilterLength); + + filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, + &lastFilterLength); + + for (int outY = 0; outY < numOutputRows; outY++) { + filterValues = filterY.FilterForValue(outY, + &filterOffset, &filterLength); + + // Generate output rows until we have enough to run the current filter. + while (nextXRow < filterOffset + filterLength) { + if (convolveProcs->fConvolve4RowsHorizontally && + nextXRow + 3 < lastFilterOffset + lastFilterLength - + avoidSimdRows) { + const unsigned char* src[4]; + unsigned char* outRow[4]; + for (int i = 0; i < 4; ++i) { + src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride]; + outRow[i] = rowBuffer.advanceRow(); + } + convolveProcs->fConvolve4RowsHorizontally(src, filterX, outRow); + nextXRow += 4; + } else { + // Check if we need to avoid SSE2 for this row. + if (convolveProcs->fConvolveHorizontally && + nextXRow < lastFilterOffset + lastFilterLength - + avoidSimdRows) { + convolveProcs->fConvolveHorizontally( + &sourceData[nextXRow * sourceByteRowStride], + filterX, rowBuffer.advanceRow(), sourceHasAlpha); + } else { + if (sourceHasAlpha) { + ConvolveHorizontally<true>( + &sourceData[nextXRow * sourceByteRowStride], + filterX, rowBuffer.advanceRow()); + } else { + ConvolveHorizontally<false>( + &sourceData[nextXRow * sourceByteRowStride], + filterX, rowBuffer.advanceRow()); + } + } + nextXRow++; + } + } + + // Compute where in the output image this row of final data will go. 
+ unsigned char* curOutputRow = &output[outY * outputByteRowStride]; + + // Get the list of rows that the circular buffer has, in order. + int firstRowInCircularBuffer; + unsigned char* const* rowsToConvolve = + rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); + + // Now compute the start of the subset of those rows that the filter + // needs. + unsigned char* const* firstRowForFilter = + &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; + + if (convolveProcs->fConvolveVertically) { + convolveProcs->fConvolveVertically(filterValues, filterLength, + firstRowForFilter, + filterX.numValues(), curOutputRow, + sourceHasAlpha); + } else { + ConvolveVertically(filterValues, filterLength, + firstRowForFilter, + filterX.numValues(), curOutputRow, + sourceHasAlpha); + } + } +} diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h new file mode 100644 index 0000000000..a2758e57a8 --- /dev/null +++ b/src/core/SkConvolver.h @@ -0,0 +1,203 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SK_CONVOLVER_H +#define SK_CONVOLVER_H + +#include "SkSize.h" +#include "SkTypes.h" +#include "SkTArray.h" + +// avoid confusion with Mac OS X's math library (Carbon) +#if defined(__APPLE__) +#undef FloatToConvolutionFixed +#undef ConvolutionFixedToFloat +#endif + +// Represents a filter in one dimension. Each output pixel has one entry in this +// object for the filter values contributing to it. You build up the filter +// list by calling AddFilter for each output pixel (in order). +// +// We do 2-dimensional convolution by first convolving each row by one +// SkConvolutionFilter1D, then convolving each column by another one. +// +// Entries are stored in ConvolutionFixed point, shifted left by kShiftBits. 
// Represents a filter in one dimension. Each output pixel has one entry in this
// object for the filter values contributing to it. You build up the filter
// list by calling AddFilter for each output pixel (in order).
//
// We do 2-dimensional convolution by first convolving each row by one
// SkConvolutionFilter1D, then convolving each column by another one.
//
// Entries are stored in ConvolutionFixed point, shifted left by kShiftBits.
class SkConvolutionFilter1D {
public:
    typedef short ConvolutionFixed;

    // The number of bits that ConvolutionFixed point values are shifted by.
    enum { kShiftBits = 14 };

    SK_API SkConvolutionFilter1D();
    SK_API ~SkConvolutionFilter1D();

    // Convert between floating point and our ConvolutionFixed point representation.
    static ConvolutionFixed FloatToFixed(float f) {
        return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
    }
    static unsigned char FixedToChar(ConvolutionFixed x) {
        return static_cast<unsigned char>(x >> kShiftBits);
    }
    static float FixedToFloat(ConvolutionFixed x) {
        // The cast relies on ConvolutionFixed being a short, implying that on
        // the platforms we care about all (16) bits will fit into
        // the mantissa of a (32-bit) float.
        SK_COMPILE_ASSERT(sizeof(ConvolutionFixed) == 2, ConvolutionFixed_type_should_fit_in_float_mantissa);
        float raw = static_cast<float>(x);
        return ldexpf(raw, -kShiftBits);
    }

    // Returns the maximum pixel span of a filter (the longest trimmed filter
    // added so far).
    int maxFilter() const { return fMaxFilter; }

    // Returns the number of filters in this filter. This is the dimension of the
    // output image.
    int numValues() const { return static_cast<int>(fFilters.count()); }

    // Appends the given list of scaling values for generating a given output
    // pixel. |filterOffset| is the distance from the edge of the image to where
    // the scaling factors start. The scaling factors apply to the source pixels
    // starting from this position, and going for the next |filterLength| pixels.
    //
    // You will probably want to make sure your input is normalized (that is,
    // all entries in |filterValues| sum to one) to prevent affecting the overall
    // brightness of the image.
    //
    // The filterLength must be > 0.
    //
    // This version will automatically convert your input to ConvolutionFixed point.
    SK_API void AddFilter(int filterOffset,
                          const float* filterValues,
                          int filterLength);

    // Same as the above version, but the input is already ConvolutionFixed point.
    void AddFilter(int filterOffset,
                   const ConvolutionFixed* filterValues,
                   int filterLength);

    // Retrieves a filter for the given |valueOffset|, a position in the output
    // image in the direction we're convolving. The offset and length of the
    // filter values are put into the corresponding out arguments (see AddFilter
    // above for what these mean), and a pointer to the first scaling factor is
    // returned. There will be |filterLength| values in this array.
    // Returns NULL when the stored filter has no values (trimmed length 0).
    inline const ConvolutionFixed* FilterForValue(int valueOffset,
                                                  int* filterOffset,
                                                  int* filterLength) const {
        const FilterInstance& filter = fFilters[valueOffset];
        *filterOffset = filter.fOffset;
        *filterLength = filter.fTrimmedLength;
        if (filter.fTrimmedLength == 0) {
            return NULL;
        }
        return &fFilterValues[filter.fDataLocation];
    }

    // Retrieves the filter for the offset 0, presumed to be the one and only.
    // The offset and length of the filter values are put into the corresponding
    // out arguments (see AddFilter). Note that |filterLength| and
    // |specifiedFilterLength| may be different if leading/trailing zeros of the
    // original floating point form were clipped.
    // There will be |filterLength| values in the return array.
    // Returns NULL if the filter is 0-length (for instance when all floating
    // point values passed to AddFilter were clipped to 0).
    SK_API const ConvolutionFixed* GetSingleFilter(int* specifiedFilterLength,
                                                   int* filterOffset,
                                                   int* filterLength) const;

    // Add another value to the fFilterValues array -- useful for
    // SIMD padding which happens outside of this class.

    void addFilterValue( ConvolutionFixed val ) {
        fFilterValues.push_back( val );
    }
private:
    struct FilterInstance {
        // Index into fFilterValues of the first value of this filter instance.
        int fDataLocation;

        // Position, in pixels, of the first source pixel the stored values
        // apply to.
        int fOffset;

        // Number of values stored for this filter instance.
        int fTrimmedLength;

        // Filter length as specified. Note that this may be different from
        // fTrimmedLength if leading/trailing zeros of the original floating
        // point form were clipped differently on each tail.
        int fLength;
    };

    // Stores the information for each filter added to this class.
    SkTArray<FilterInstance> fFilters;

    // We store all the filter values in this flat list, indexed by
    // |FilterInstance.fDataLocation| to avoid the mallocs required for storing
    // each one separately.
    SkTArray<ConvolutionFixed> fFilterValues;

    // The maximum size of any filter we've added.
    int fMaxFilter;
};

// Signatures of the optional accelerated convolution routines.
typedef void (*SkConvolveVertically_pointer)(
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    int filterLength,
    unsigned char* const* sourceDataRows,
    int pixelWidth,
    unsigned char* outRow,
    bool hasAlpha);
typedef void (*SkConvolve4RowsHorizontally_pointer)(
    const unsigned char* srcData[4],
    const SkConvolutionFilter1D& filter,
    unsigned char* outRow[4]);
typedef void (*SkConvolveHorizontally_pointer)(
    const unsigned char* srcData,
    const SkConvolutionFilter1D& filter,
    unsigned char* outRow,
    bool hasAlpha);
typedef void (*SkConvolveFilterPadding_pointer)(
    SkConvolutionFilter1D* filter);

// Table of optional accelerated routines handed to BGRAConvolve2D.
struct SkConvolutionProcs {
  // This is how many extra pixels may be read by the
  // convolve*horizontally functions.
  int fExtraHorizontalReads;
  SkConvolveVertically_pointer fConvolveVertically;
  SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
  SkConvolveHorizontally_pointer fConvolveHorizontally;
  SkConvolveFilterPadding_pointer fApplySIMDPadding;
};
// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
//
// One 4-byte output pixel is written to |out_row| per filter entry. All four
// channels are always computed; the |has_alpha| argument is ignored.
//
// Both the coefficient loads (8 bytes) and the pixel loads (16 bytes) are
// full-width even when fewer taps remain, so this routine can read past the
// last coefficient and past the last pixel a filter actually uses.
void convolveHorizontally_SSE2(const unsigned char* src_data,
                               const SkConvolutionFilter1D& filter,
                               unsigned char* out_row,
                               bool /*has_alpha*/) {
    int num_values = filter.numValues();

    int filter_offset, filter_length;
    __m128i zero = _mm_setzero_si128();
    __m128i mask[4];
    // |mask| will be used to decimate all extra filter coefficients that are
    // loaded by SIMD when |filter_length| is not divisible by 4.
    // mask[0] is not used in following algorithm.
    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

    // Output one pixel each iteration, calculating all channels (RGBA) together.
    for (int out_x = 0; out_x < num_values; out_x++) {
        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
            filter.FilterForValue(out_x, &filter_offset, &filter_length);

        __m128i accum = _mm_setzero_si128();

        // Compute the first pixel in this row that the filter affects. It will
        // touch |filter_length| pixels (4 bytes each) after this.
        const __m128i* row_to_filter =
            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

        // We will load and accumulate with four coefficients per iteration.
        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {

            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
            __m128i coeff, coeff16;
            // [16] xx xx xx xx c3 c2 c1 c0
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // [16] xx xx xx xx c1 c1 c0 c0
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            // [16] c1 c1 c1 c1 c0 c0 c0 c0
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Load four pixels => unpack the first two pixels to 16 bits =>
            // multiply with coefficients => accumulate the convolution result.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            // The high and low 16 bits of each 16x16 product are computed
            // separately and interleaved back into 32-bit products.
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32]  a0*c0 b0*c0 g0*c0 r0*c0
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32]  a1*c1 b1*c1 g1*c1 r1*c1
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Duplicate 3rd and 4th coefficients for all channels =>
            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
            // => accumulate the convolution results.
            // [16] xx xx xx xx c3 c3 c2 c2
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            // [16] c3 c3 c3 c3 c2 c2 c2 c2
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            // [32]  a2*c2 b2*c2 g2*c2 r2*c2
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            // [32]  a3*c3 b3*c3 g3*c3 r3*c3
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Advance the pixel and coefficients pointers.
            row_to_filter += 1;
            filter_values += 4;
        }

        // When |filter_length| is not divisible by 4, the coefficients loaded
        // beyond the end of the filter are masked to zero. Other than that the
        // algorithm is the same as above, except that the 4th pixel is always
        // absent (at most three taps remain).
        int r = filter_length&3;
        if (r) {
            // Note: filter_values must be padded to align_up(filter_offset, 8).
            // (This load reads four coefficients even though only |r| remain.)
            __m128i coeff, coeff16;
            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
            // Mask out extra filter taps.
            coeff = _mm_and_si128(coeff, mask[r]);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

            // Note: line buffer must be padded to align_up(filter_offset, 16).
            // This is resolved by using the C version for the last horizontal
            // line. (This load reads four pixels even though only |r| remain.)
            __m128i src8 = _mm_loadu_si128(row_to_filter);
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);

            // Third remaining tap; only the low half of the products is
            // accumulated because a fourth tap cannot exist here.
            src16 = _mm_unpackhi_epi8(src8, zero);
            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
            mul_hi = _mm_mulhi_epi16(src16, coeff16);
            mul_lo = _mm_mullo_epi16(src16, coeff16);
            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
            accum = _mm_add_epi32(accum, t);
        }

        // Shift right for fixed point implementation.
        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);

        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
        accum = _mm_packs_epi32(accum, zero);
        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
        accum = _mm_packus_epi16(accum, zero);

        // Store the pixel value of 32 bits.
        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
        out_row += 4;
    }
}
+void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], + const SkConvolutionFilter1D& filter, + unsigned char* out_row[4]) { + int num_values = filter.numValues(); + + int filter_offset, filter_length; + __m128i zero = _mm_setzero_si128(); + __m128i mask[4]; + // |mask| will be used to decimate all extra filter coefficients that are + // loaded by SIMD when |filter_length| is not divisible by 4. + // mask[0] is not used in following algorithm. + mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); + mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); + mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); + + // Output one pixel each iteration, calculating all channels (RGBA) together. + for (int out_x = 0; out_x < num_values; out_x++) { + const SkConvolutionFilter1D::ConvolutionFixed* filter_values = + filter.FilterForValue(out_x, &filter_offset, &filter_length); + + // four pixels in a column per iteration. + __m128i accum0 = _mm_setzero_si128(); + __m128i accum1 = _mm_setzero_si128(); + __m128i accum2 = _mm_setzero_si128(); + __m128i accum3 = _mm_setzero_si128(); + int start = (filter_offset<<2); + // We will load and accumulate with four coefficients per iteration. 
+ for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { + __m128i coeff, coeff16lo, coeff16hi; + // [16] xx xx xx xx c3 c2 c1 c0 + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // [16] xx xx xx xx c1 c1 c0 c0 + coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + // [16] c1 c1 c1 c1 c0 c0 c0 c0 + coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); + // [16] xx xx xx xx c3 c3 c2 c2 + coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + // [16] c3 c3 c3 c3 c2 c2 c2 c2 + coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); + + __m128i src8, src16, mul_hi, mul_lo, t; + +#define ITERATION(src, accum) \ + src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ + src16 = _mm_unpacklo_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ + mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + src16 = _mm_unpackhi_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ + mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t) + + ITERATION(src_data[0] + start, accum0); + ITERATION(src_data[1] + start, accum1); + ITERATION(src_data[2] + start, accum2); + ITERATION(src_data[3] + start, accum3); + + start += 16; + filter_values += 4; + } + + int r = filter_length & 3; + if (r) { + // Note: filter_values must be padded to align_up(filter_offset, 8); + __m128i coeff; + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); + // Mask out extra filter taps. 
+ coeff = _mm_and_si128(coeff, mask[r]); + + __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + /* c1 c1 c1 c1 c0 c0 c0 c0 */ + coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); + __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); + + __m128i src8, src16, mul_hi, mul_lo, t; + + ITERATION(src_data[0] + start, accum0); + ITERATION(src_data[1] + start, accum1); + ITERATION(src_data[2] + start, accum2); + ITERATION(src_data[3] + start, accum3); + } + + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum0 = _mm_packs_epi32(accum0, zero); + accum0 = _mm_packus_epi16(accum0, zero); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_packs_epi32(accum1, zero); + accum1 = _mm_packus_epi16(accum1, zero); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_packs_epi32(accum2, zero); + accum2 = _mm_packus_epi16(accum2, zero); + accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); + accum3 = _mm_packs_epi32(accum3, zero); + accum3 = _mm_packus_epi16(accum3, zero); + + *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); + *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); + *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); + *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); + + out_row[0] += 4; + out_row[1] += 4; + out_row[2] += 4; + out_row[3] += 4; + } +} + +// Does vertical convolution to produce one output row. The filter values and +// length are given in the first two parameters. These are applied to each +// of the rows pointed to in the |source_data_rows| array, with each row +// being |pixel_width| wide. +// +// The output must have room for |pixel_width * 4| bytes. 
template<bool has_alpha>
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
    // Final alpha treatment of four packed 8-bit pixels; used by both the
    // four-pixel loop and the tail below.
    struct Alpha {
        static inline __m128i finish(const __m128i& rgba8) {
            if (has_alpha) {
                // Per pixel, gather max(r, g, b) in the lowest byte.
                // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
                __m128i shifted = _mm_srli_epi32(rgba8, 8);
                // Lowest byte of each pixel: max(r, g).
                __m128i peak = _mm_max_epu8(shifted, rgba8);
                // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
                shifted = _mm_srli_epi32(rgba8, 16);
                // Lowest byte of each pixel: max(r, g, b).
                peak = _mm_max_epu8(shifted, peak);
                // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
                peak = _mm_slli_epi32(peak, 24);
                // Alpha ends up at least as large as the largest color
                // channel; the color bytes are unchanged (max against 0).
                return _mm_max_epu8(peak, rgba8);
            } else {
                // Force every alpha byte to 0xFF.
                return _mm_or_si128(rgba8, _mm_set1_epi32(0xff000000));
            }
        }
    };

    const __m128i zero = _mm_setzero_si128();
    // Number of pixels covered by whole groups of four.
    const int simd_width = pixel_width & ~3;

    // Four output pixels (16 bytes) per iteration.
    for (int out_x = 0; out_x < simd_width; out_x += 4) {
        // One 32-bit-per-channel accumulator for each of the four pixels.
        __m128i sum0 = _mm_setzero_si128();
        __m128i sum1 = _mm_setzero_si128();
        __m128i sum2 = _mm_setzero_si128();
        __m128i sum3 = _mm_setzero_si128();

        // One filter coefficient (one source row) per iteration.
        for (int tap = 0; tap < filter_length; ++tap) {
            // [16] cj cj cj cj cj cj cj cj
            const __m128i coeff16 = _mm_set1_epi16(filter_values[tap]);
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const __m128i src8 = _mm_loadu_si128(
                reinterpret_cast<const __m128i*>(&source_data_rows[tap][out_x << 2]));

            // Pixels 0 and 1: widen to 16 bits, multiply, accumulate.
            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i prod_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i prod_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            sum0 = _mm_add_epi32(sum0, _mm_unpacklo_epi16(prod_lo, prod_hi));
            // [32] a1 b1 g1 r1
            sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi16(prod_lo, prod_hi));

            // Pixels 2 and 3.
            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            prod_hi = _mm_mulhi_epi16(src16, coeff16);
            prod_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            sum2 = _mm_add_epi32(sum2, _mm_unpacklo_epi16(prod_lo, prod_hi));
            // [32] a3 b3 g3 r3
            sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi16(prod_lo, prod_hi));
        }

        // Drop the fixed-point fraction bits.
        sum0 = _mm_srai_epi32(sum0, SkConvolutionFilter1D::kShiftBits);
        sum1 = _mm_srai_epi32(sum1, SkConvolutionFilter1D::kShiftBits);
        sum2 = _mm_srai_epi32(sum2, SkConvolutionFilter1D::kShiftBits);
        sum3 = _mm_srai_epi32(sum3, SkConvolutionFilter1D::kShiftBits);

        // 32 -> 16 bits per channel (signed saturation).
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        sum0 = _mm_packs_epi32(sum0, sum1);
        // [16] a3 b3 g3 r3 a2 b2 g2 r2
        sum2 = _mm_packs_epi32(sum2, sum3);
        // 16 -> 8 bits per channel (unsigned saturation).
        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
        sum0 = _mm_packus_epi16(sum0, sum2);

        // Store 16 bytes and move on to the next four pixels.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), Alpha::finish(sum0));
        out_row += 16;
    }

    // Tail: 1..3 pixels remain when |pixel_width| is not a multiple of four.
    // They are computed together (the fourth lane is never accumulated) and
    // then written out one 4-byte pixel at a time.
    const int tail_pixels = pixel_width & 3;
    if (tail_pixels) {
        __m128i sum0 = _mm_setzero_si128();
        __m128i sum1 = _mm_setzero_si128();
        __m128i sum2 = _mm_setzero_si128();

        for (int tap = 0; tap < filter_length; ++tap) {
            const __m128i coeff16 = _mm_set1_epi16(filter_values[tap]);
            // Still a full 16-byte load starting at the first tail pixel.
            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            const __m128i src8 = _mm_loadu_si128(
                reinterpret_cast<const __m128i*>(&source_data_rows[tap][simd_width << 2]));

            // [16] a1 b1 g1 r1 a0 b0 g0 r0
            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
            __m128i prod_hi = _mm_mulhi_epi16(src16, coeff16);
            __m128i prod_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a0 b0 g0 r0
            sum0 = _mm_add_epi32(sum0, _mm_unpacklo_epi16(prod_lo, prod_hi));
            // [32] a1 b1 g1 r1
            sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi16(prod_lo, prod_hi));

            // [16] a3 b3 g3 r3 a2 b2 g2 r2
            src16 = _mm_unpackhi_epi8(src8, zero);
            prod_hi = _mm_mulhi_epi16(src16, coeff16);
            prod_lo = _mm_mullo_epi16(src16, coeff16);
            // [32] a2 b2 g2 r2
            sum2 = _mm_add_epi32(sum2, _mm_unpacklo_epi16(prod_lo, prod_hi));
        }

        sum0 = _mm_srai_epi32(sum0, SkConvolutionFilter1D::kShiftBits);
        sum1 = _mm_srai_epi32(sum1, SkConvolutionFilter1D::kShiftBits);
        sum2 = _mm_srai_epi32(sum2, SkConvolutionFilter1D::kShiftBits);
        // [16] a1 b1 g1 r1 a0 b0 g0 r0
        sum0 = _mm_packs_epi32(sum0, sum1);
        // [16] 00 00 00 00 a2 b2 g2 r2
        sum2 = _mm_packs_epi32(sum2, zero);
        // [8] pixel 2, pixel 1, pixel 0 in the low 12 bytes.
        sum0 = _mm_packus_epi16(sum0, sum2);

        __m128i packed = Alpha::finish(sum0);

        // Emit the low pixel, then shift the next one down into its place.
        for (int left = tail_pixels; left > 0; --left) {
            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(packed);
            packed = _mm_srli_si128(packed, 4);
            out_row += 4;
        }
    }
}

// Runtime entry point: forwards to the template instantiation selected by
// |has_alpha|; all other arguments are passed through untouched.
void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row,
                             bool has_alpha) {
    if (!has_alpha) {
        convolveVertically_SSE2<false>(filter_values, filter_length,
                                       source_data_rows, pixel_width, out_row);
        return;
    }
    convolveVertically_SSE2<true>(filter_values, filter_length,
                                  source_data_rows, pixel_width, out_row);
}

// Appends eight zero-valued dummy coefficients after the coefficients of the
// last filter, so that SIMD instructions which load 8 or 16 bytes at a time do
// not access invalid memory. No attempt is made to align the coefficients
// because of the opaqueness of the <vector> implementation.
// This has to be done after all |AddFilter| calls.
void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    const SkConvolutionFilter1D::ConvolutionFixed zero_tap =
        static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0);
    int remaining = 8;
    while (remaining-- > 0) {
        filter->addFilterValue(zero_tap);
    }
}
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.h b/src/opts/SkBitmapFilter_opts_SSE2.h index c511acc83a..588f4ef18b 100644 --- a/src/opts/SkBitmapFilter_opts_SSE2.h +++ b/src/opts/SkBitmapFilter_opts_SSE2.h @@ -11,10 +11,27 @@ #define SkBitmapFilter_opts_sse2_DEFINED #include "SkBitmapProcState.h" +#include "SkConvolver.h" void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y, SkPMColor *SK_RESTRICT colors, int count); void highQualityFilter_SSE2(const SkBitmapProcState &s, int x, int y, SkPMColor *SK_RESTRICT colors, int count); + +void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, + int filter_length, + unsigned char* const* source_data_rows, + int pixel_width, + unsigned char* out_row, + bool has_alpha); +void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], + const SkConvolutionFilter1D& filter, + unsigned char* out_row[4]); +void convolveHorizontally_SSE2(const unsigned char* src_data, + const SkConvolutionFilter1D& filter, + unsigned char* out_row, + bool has_alpha); +void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter); + #endif diff --git a/src/opts/SkBitmapProcState_opts_none.cpp b/src/opts/SkBitmapProcState_opts_none.cpp index 3a186b5bfe..62af6d0f83 100644 --- a/src/opts/SkBitmapProcState_opts_none.cpp +++ b/src/opts/SkBitmapProcState_opts_none.cpp @@ -21,3 +21,6 @@ // empty implementation just uses default supplied function pointers void SkBitmapProcState::platformProcs() {} + +// empty implementation just uses default supplied function pointers +void SkBitmapProcState::platformScaleProc() {} diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp index 37ce9036ca..0bb450356d 100644 --- a/src/opts/opts_check_SSE2.cpp +++ b/src/opts/opts_check_SSE2.cpp @@ -107,6 +107,16 @@ static bool cachedHasSSSE3() { SK_CONF_DECLARE( bool, 
c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters"); +void SkBitmapProcState::platformConvolutionProcs() { + if (cachedHasSSE2()) { + fConvolutionProcs->fExtraHorizontalReads = 3; + fConvolutionProcs->fConvolveVertically = &convolveVertically_SSE2; + fConvolutionProcs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; + fConvolutionProcs->fConvolveHorizontally = &convolveHorizontally_SSE2; + fConvolutionProcs->fApplySIMDPadding = &applySIMDPadding_SSE2; + } +} + void SkBitmapProcState::platformProcs() { if (cachedHasSSSE3()) { #if !defined(SK_BUILD_FOR_ANDROID) @@ -151,9 +161,6 @@ void SkBitmapProcState::platformProcs() { if (fShaderProc32 == highQualityFilter) { fShaderProc32 = highQualityFilter_SSE2; } - if (fShaderProc32 == highQualityFilter_ScaleOnly) { - fShaderProc32 = highQualityFilter_ScaleOnly_SSE2; - } } } } |