From 54299654e964ab53189f774e30bce2adebbdc857 Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Mon, 14 Apr 2014 14:54:22 +0000 Subject: Xfermode: SSE2 implementation of a number of simple transfer modes These modes share some common code and are not very complex, so group them together. This CL yields about 50% performance improvement on desktop i7-3770. Here are the data: before: Xfermode_Screen 8888: cmsecs = 30.25 565: cmsecs = 46.81 Xfermode_Modulate 8888: cmsecs = 22.48 565: cmsecs = 40.06 Xfermode_Plus 8888: cmsecs = 21.04 565: cmsecs = 37.51 Xfermode_Xor 8888: cmsecs = 37.18 565: cmsecs = 52.53 Xfermode_DstATop 8888: cmsecs = 28.97 565: cmsecs = 46.42 Xfermode_SrcATop 8888: cmsecs = 29.74 565: cmsecs = 46.25 Xfermode_DstOut 8888: cmsecs = 5.34 565: cmsecs = 24.53 Xfermode_SrcOut 8888: cmsecs = 12.25 565: cmsecs = 24.39 Xfermode_DstIn 8888: cmsecs = 5.30 565: cmsecs = 24.50 Xfermode_SrcIn 8888: cmsecs = 12.05 565: cmsecs = 25.40 Xfermode_DstOver 8888: cmsecs = 12.45 565: cmsecs = 0.15 Xfermode_SrcOver 8888: cmsecs = 2.68 565: cmsecs = 4.42 after: Xfermode_Screen 8888: cmsecs = 13.68 565: cmsecs = 21.73 Xfermode_Modulate 8888: cmsecs = 13.25 565: cmsecs = 20.97 Xfermode_Plus 8888: cmsecs = 9.77 565: cmsecs = 16.71 Xfermode_Xor 8888: cmsecs = 17.64 565: cmsecs = 25.62 Xfermode_DstATop 8888: cmsecs = 15.99 565: cmsecs = 23.74 Xfermode_SrcATop 8888: cmsecs = 15.69 565: cmsecs = 23.40 Xfermode_DstOut 8888: cmsecs = 4.77 565: cmsecs = 11.85 Xfermode_SrcOut 8888: cmsecs = 4.98 565: cmsecs = 11.84 Xfermode_DstIn 8888: cmsecs = 4.68 565: cmsecs = 11.72 Xfermode_SrcIn 8888: cmsecs = 4.93 565: cmsecs = 11.79 Xfermode_DstOver 8888: cmsecs = 5.04 565: cmsecs = 0.15 Xfermode_SrcOver 8888: cmsecs = 2.69 565: cmsecs = 4.42 BUG=skia: R=mtklein@google.com Author: qiankun.miao@intel.com Review URL: https://codereview.chromium.org/232793002 git-svn-id: http://skia.googlecode.com/svn/trunk@14176 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/opts/SkColor_opts_SSE2.h | 25 
+++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'src/opts/SkColor_opts_SSE2.h') diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h index 24ab6f73c8..960c48a023 100644 --- a/src/opts/SkColor_opts_SSE2.h +++ b/src/opts/SkColor_opts_SSE2.h @@ -10,6 +10,10 @@ #include <emmintrin.h> +static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) { + return _mm_add_epi32(alpha, _mm_set1_epi32(1)); +} + // See #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) in SkXfermode.cpp. static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a, const __m128i& b) { @@ -21,6 +25,27 @@ static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a, return prod; } +// Portable version SkAlphaMulQ is in SkColorPriv.h. +static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) { + __m128i mask = _mm_set1_epi32(gMask_00FF00FF); + __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale); + + // uint32_t rb = ((c & mask) * scale) >> 8 + __m128i rb = _mm_and_si128(mask, c); + rb = _mm_mullo_epi16(rb, s); + rb = _mm_srli_epi16(rb, 8); + + // uint32_t ag = ((c >> 8) & mask) * scale + __m128i ag = _mm_srli_epi16(c, 8); + ag = _mm_and_si128(ag, mask); + ag = _mm_mullo_epi16(ag, s); + + // (rb & mask) | (ag & ~mask) + rb = _mm_and_si128(mask, rb); + ag = _mm_andnot_si128(mask, ag); + return _mm_or_si128(rb, ag); +} + static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) { __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT)); return _mm_srli_epi32(a, 24); -- cgit v1.2.3