diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-04-14 14:54:22 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-04-14 14:54:22 +0000 |
commit | 54299654e964ab53189f774e30bce2adebbdc857 (patch) | |
tree | 594ba2f9e2e02e770b8932c10921df863f9c84f7 /src | |
parent | 282333ffa2f0f343b4bde51af9be5720184c872a (diff) |
Xfermode: SSE2 implementation of a number of simple transfer modes
These modes share some common code and are not very complex, so they are
grouped together. This CL yields about a 50% performance improvement on a
desktop i7-3770. Here are the data:
before:
Xfermode_Screen 8888: cmsecs = 30.25 565: cmsecs = 46.81
Xfermode_Modulate 8888: cmsecs = 22.48 565: cmsecs = 40.06
Xfermode_Plus 8888: cmsecs = 21.04 565: cmsecs = 37.51
Xfermode_Xor 8888: cmsecs = 37.18 565: cmsecs = 52.53
Xfermode_DstATop 8888: cmsecs = 28.97 565: cmsecs = 46.42
Xfermode_SrcATop 8888: cmsecs = 29.74 565: cmsecs = 46.25
Xfermode_DstOut 8888: cmsecs = 5.34 565: cmsecs = 24.53
Xfermode_SrcOut 8888: cmsecs = 12.25 565: cmsecs = 24.39
Xfermode_DstIn 8888: cmsecs = 5.30 565: cmsecs = 24.50
Xfermode_SrcIn 8888: cmsecs = 12.05 565: cmsecs = 25.40
Xfermode_DstOver 8888: cmsecs = 12.45 565: cmsecs = 0.15
Xfermode_SrcOver 8888: cmsecs = 2.68 565: cmsecs = 4.42
after:
Xfermode_Screen 8888: cmsecs = 13.68 565: cmsecs = 21.73
Xfermode_Modulate 8888: cmsecs = 13.25 565: cmsecs = 20.97
Xfermode_Plus 8888: cmsecs = 9.77 565: cmsecs = 16.71
Xfermode_Xor 8888: cmsecs = 17.64 565: cmsecs = 25.62
Xfermode_DstATop 8888: cmsecs = 15.99 565: cmsecs = 23.74
Xfermode_SrcATop 8888: cmsecs = 15.69 565: cmsecs = 23.40
Xfermode_DstOut 8888: cmsecs = 4.77 565: cmsecs = 11.85
Xfermode_SrcOut 8888: cmsecs = 4.98 565: cmsecs = 11.84
Xfermode_DstIn 8888: cmsecs = 4.68 565: cmsecs = 11.72
Xfermode_SrcIn 8888: cmsecs = 4.93 565: cmsecs = 11.79
Xfermode_DstOver 8888: cmsecs = 5.04 565: cmsecs = 0.15
Xfermode_SrcOver 8888: cmsecs = 2.69 565: cmsecs = 4.42
BUG=skia:
R=mtklein@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/232793002
git-svn-id: http://skia.googlecode.com/svn/trunk@14176 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/SkColor_opts_SSE2.h | 25 | ||||
-rw-r--r-- | src/opts/SkXfermode_opts_SSE2.cpp | 169 |
2 files changed, 182 insertions, 12 deletions
diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h index 24ab6f73c8..960c48a023 100644 --- a/src/opts/SkColor_opts_SSE2.h +++ b/src/opts/SkColor_opts_SSE2.h @@ -10,6 +10,10 @@ #include <emmintrin.h> +static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) { + return _mm_add_epi32(alpha, _mm_set1_epi32(1)); +} + // See #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) in SkXfermode.cpp. static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a, const __m128i& b) { @@ -21,6 +25,27 @@ static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a, return prod; } +// Portable version SkAlphaMulQ is in SkColorPriv.h. +static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) { + __m128i mask = _mm_set1_epi32(gMask_00FF00FF); + __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale); + + // uint32_t rb = ((c & mask) * scale) >> 8 + __m128i rb = _mm_and_si128(mask, c); + rb = _mm_mullo_epi16(rb, s); + rb = _mm_srli_epi16(rb, 8); + + // uint32_t ag = ((c >> 8) & mask) * scale + __m128i ag = _mm_srli_epi16(c, 8); + ag = _mm_and_si128(ag, mask); + ag = _mm_mullo_epi16(ag, s); + + // (rb & mask) | (ag & ~mask) + rb = _mm_and_si128(mask, rb); + ag = _mm_andnot_si128(mask, ag); + return _mm_or_si128(rb, ag); +} + static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); return _mm_srli_epi32(a, 24); diff --git a/src/opts/SkXfermode_opts_SSE2.cpp b/src/opts/SkXfermode_opts_SSE2.cpp index 01c782b9ef..d1a2632fdb 100644 --- a/src/opts/SkXfermode_opts_SSE2.cpp +++ b/src/opts/SkXfermode_opts_SSE2.cpp @@ -17,6 +17,15 @@ static inline __m128i SkDiv255Round_SSE2(const __m128i& a) { return prod; } +static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) { + __m128i sum = _mm_add_epi32(a, b); + __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255)); + + sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)), + _mm_andnot_si128(cmp, sum)); + return 
sum; +} + static inline __m128i clamp_div255round_SSE2(const __m128i& prod) { // test if > 0 __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128()); @@ -38,6 +47,130 @@ static inline __m128i clamp_div255round_SSE2(const __m128i& prod) { return ret; } +static __m128i srcover_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src)); + return _mm_add_epi32(src, SkAlphaMulQ_SSE2(dst, isa)); +} + +static __m128i dstover_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst)); + return _mm_add_epi32(dst, SkAlphaMulQ_SSE2(src, ida)); +} + +static __m128i srcin_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i da = SkGetPackedA32_SSE2(dst); + return SkAlphaMulQ_SSE2(src, SkAlpha255To256_SSE2(da)); +} + +static __m128i dstin_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + return SkAlphaMulQ_SSE2(dst, SkAlpha255To256_SSE2(sa)); +} + +static __m128i srcout_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(dst)); + return SkAlphaMulQ_SSE2(src, ida); +} + +static __m128i dstout_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(256), SkGetPackedA32_SSE2(src)); + return SkAlphaMulQ_SSE2(dst, isa); +} + +static __m128i srcatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + + __m128i a = da; + + __m128i r1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(isa, 
SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(da, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i dstatop_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + + __m128i a = sa; + + __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(sa, SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i sa = SkGetPackedA32_SSE2(src); + __m128i da = SkGetPackedA32_SSE2(dst); + __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); + __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); + + __m128i a1 = _mm_add_epi32(sa, da); + __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da); + a2 = _mm_slli_epi32(a2, 1); + __m128i a = _mm_sub_epi32(a1, a2); + + __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src)); + __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst)); + __m128i r = _mm_add_epi32(r1, r2); + + __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src)); + __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst)); + __m128i g = _mm_add_epi32(g1, g2); + + __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src)); + __m128i b2 = SkAlphaMulAlpha_SSE2(isa, 
SkGetPackedB32_SSE2(dst)); + __m128i b = _mm_add_epi32(b1, b2); + + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i plus_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i b = saturated_add_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + __m128i g = saturated_add_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i r = saturated_add_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i a = saturated_add_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + +static __m128i modulate_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i a = SkAlphaMulAlpha_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + __m128i r = SkAlphaMulAlpha_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i g = SkAlphaMulAlpha_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i b = SkAlphaMulAlpha_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + static inline __m128i srcover_byte_SSE2(const __m128i& a, const __m128i& b) { // a + b - SkAlphaMulAlpha(a, b); return _mm_sub_epi32(_mm_add_epi32(a, b), SkAlphaMulAlpha_SSE2(a, b)); @@ -84,6 +217,18 @@ static __m128i multiply_modeproc_SSE2(const __m128i& src, const __m128i& dst) { return SkPackARGB32_SSE2(a, r, g, b); } +static __m128i screen_modeproc_SSE2(const __m128i& src, const __m128i& dst) { + __m128i a = srcover_byte_SSE2(SkGetPackedA32_SSE2(src), + SkGetPackedA32_SSE2(dst)); + __m128i r = srcover_byte_SSE2(SkGetPackedR32_SSE2(src), + SkGetPackedR32_SSE2(dst)); + __m128i g = srcover_byte_SSE2(SkGetPackedG32_SSE2(src), + SkGetPackedG32_SSE2(dst)); + __m128i b = srcover_byte_SSE2(SkGetPackedB32_SSE2(src), + SkGetPackedB32_SSE2(dst)); + return SkPackARGB32_SSE2(a, r, g, b); +} + //////////////////////////////////////////////////////////////////////////////// typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& 
src, const __m128i& dst); @@ -226,18 +371,18 @@ SkXfermodeProcSIMD gSSE2XfermodeProcs[] = { NULL, // kClear_Mode NULL, // kSrc_Mode NULL, // kDst_Mode - NULL, // kSrcOver_Mode - NULL, // kDstOver_Mode - NULL, // kSrcIn_Mode - NULL, // kDstIn_Mode - NULL, // kSrcOut_Mode - NULL, // kDstOut_Mode - NULL, // kSrcATop_Mode - NULL, // kDstATop_Mode - NULL, // kXor_Mode - NULL, // kPlus_Mode - NULL, // kModulate_Mode - NULL, // kScreen_Mode + srcover_modeproc_SSE2, + dstover_modeproc_SSE2, + srcin_modeproc_SSE2, + dstin_modeproc_SSE2, + srcout_modeproc_SSE2, + dstout_modeproc_SSE2, + srcatop_modeproc_SSE2, + dstatop_modeproc_SSE2, + xor_modeproc_SSE2, + plus_modeproc_SSE2, + modulate_modeproc_SSE2, + screen_modeproc_SSE2, NULL, // kOverlay_Mode NULL, // kDarken_Mode |