diff options
author | qiankun.miao <qiankun.miao@intel.com> | 2014-09-03 08:24:12 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2014-09-03 08:24:12 -0700 |
commit | b381fa10d8079c58928058bb8a6db32b39f05e51 (patch) | |
tree | c5bf9b435e9841d3dd43756431b7d82eb50ecdad /src/opts | |
parent | b0bd4f64a6827dda6f4ec48e4746b0f0b72a975f (diff) |
Enable highQualityFilter_SSE2
With SSE2, the bitmap_BGRA_8888_A_scale_rotate_bicubic benchmark runs about
40% faster on a desktop i7-3770.
BUG=skia:
R=mtklein@google.com, humper@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/525283002
Diffstat (limited to 'src/opts')
-rw-r--r-- | src/opts/SkBitmapFilter_opts_SSE2.cpp | 56 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 2 |
2 files changed, 28 insertions, 30 deletions
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp index b831e298d0..5bf5326c9e 100644 --- a/src/opts/SkBitmapFilter_opts_SSE2.cpp +++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp @@ -46,45 +46,45 @@ static inline void print128f(__m128 value) { void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, SkPMColor* SK_RESTRICT colors, int count) { - const int maxX = s.fBitmap->width() - 1; - const int maxY = s.fBitmap->height() - 1; + const int maxX = s.fBitmap->width(); + const int maxY = s.fBitmap->height(); + SkAutoTMalloc<SkScalar> xWeights(maxX); while (count-- > 0) { SkPoint srcPt; - s.fInvProc(s.fInvMatrix, SkIntToScalar(x), - SkIntToScalar(y), &srcPt); + s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt); srcPt.fX -= SK_ScalarHalf; srcPt.fY -= SK_ScalarHalf; - int sx = SkScalarFloorToInt(srcPt.fX); - int sy = SkScalarFloorToInt(srcPt.fY); - __m128 weight = _mm_setzero_ps(); __m128 accum = _mm_setzero_ps(); - int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f))); - int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f))); - int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f))); - int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f))); + int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY); + int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY); + int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX); + int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX); + + for (int srcX = x0; srcX < x1 ; srcX++) { + // Looking these up once instead of each loop is a ~15% speedup. 
+ xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX)); + } - for (int src_y = y0; src_y <= y1; src_y++) { - float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y)); + for (int srcY = y0; srcY < y1; srcY++) { + SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY)); - for (int src_x = x0; src_x <= x1 ; src_x++) { - float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x)); + for (int srcX = x0; srcX < x1 ; srcX++) { + SkScalar xWeight = xWeights[srcX - x0]; - float combined_weight = xweight * yweight; + SkScalar combined_weight = SkScalarMul(xWeight, yWeight); - SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y); + SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY); - __m128i c = _mm_cvtsi32_si128( color ); + __m128i c = _mm_cvtsi32_si128(color); c = _mm_unpacklo_epi8(c, _mm_setzero_si128()); c = _mm_unpacklo_epi16(c, _mm_setzero_si128()); - - __m128 cfloat = _mm_cvtepi32_ps( c ); + __m128 cfloat = _mm_cvtepi32_ps(c); __m128 weightVector = _mm_set1_ps(combined_weight); - accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector)); weight = _mm_add_ps( weight, weightVector ); } @@ -92,15 +92,13 @@ void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y, accum = _mm_div_ps(accum, weight); accum = _mm_add_ps(accum, _mm_set1_ps(0.5f)); + __m128i accumInt = _mm_cvttps_epi32(accum); - __m128i accumInt = _mm_cvtps_epi32( accum ); - - int localResult[4]; - _mm_storeu_si128((__m128i *) (localResult), accumInt); - int a = SkClampMax(localResult[0], 255); - int r = SkClampMax(localResult[1], a); - int g = SkClampMax(localResult[2], a); - int b = SkClampMax(localResult[3], a); + int* localResult = (int*)(&accumInt); + int a = SkClampMax(localResult[3], 255); + int r = SkClampMax(localResult[2], a); + int g = SkClampMax(localResult[1], a); + int b = SkClampMax(localResult[0], a); *colors++ = SkPackARGB32(a, r, g, b); diff --git a/src/opts/opts_check_x86.cpp 
b/src/opts/opts_check_x86.cpp index 5bab17aa00..55eb843312 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -123,7 +123,7 @@ static inline bool supports_simd(int minLevel) { //////////////////////////////////////////////////////////////////////////////// -SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters"); +SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", true, "Use SSE optimized version of high quality image filters"); void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) { if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { |