aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts/SkColor_opts_SSE2.h
diff options
context:
space:
mode:
authorGravatar lsalzman <lsalzman@mozilla.com>2016-08-05 11:48:45 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2016-08-05 11:48:45 -0700
commit40254c2c2dc28a34f96294d5a1ad94a99b0be8a6 (patch)
treedbc9c6b38ca6be76f0011623f3a529a60ce1d570 /src/opts/SkColor_opts_SSE2.h
parentf77c47b78217883e6f074dd7e3e5bed5f82e144d (diff)
SkBlendARGB32 and S32[A]_Blend_BlitRow32 are currently formulated as: SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale), which boils down to ((src*src_scale)>>8) + ((dst*dst_scale)>>8). In particular, note that the intermediate precision is discarded before the two parts are added together, causing the final result to possibly be inaccurate.
In Firefox, we use SkCanvas::saveLayer in combination with a backdrop that initializes the layer to the background. When this layer is blended back onto the background using transparency, where the source and destination pixel colors are the same, the resulting color after the blend is not preserved due to the lost precision mentioned above. In cases where this operation is repeatedly performed, this causes substantially noticeable differences in color, as evidenced in this downstream Firefox bug report: https://bugzilla.mozilla.org/show_bug.cgi?id=1200684 . The test case in the downstream report essentially does blend(src=0xFF2E3338, dst=0xFF2E3338, scale=217), which gives the result 0xFF2E3237, while we would expect to get back 0xFF2E3338. This problem goes away if the blend is instead reformulated to effectively do (src*src_scale + dst*dst_scale)>>8, which keeps the intermediate precision during the addition before shifting it off. This change modifies the blending operations accordingly. The performance should remain mostly unchanged, or possibly improve slightly, so there should be no real downside to doing this, with the benefit of making the results more accurate. Without this, it is currently unsafe for Firefox to blend a layer back onto itself that was initialized with a copy of its background. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2097883002 CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot [mtklein adds...] No public API changes. TBR=reed@google.com Review-Url: https://codereview.chromium.org/2097883002
Diffstat (limited to 'src/opts/SkColor_opts_SSE2.h')
-rw-r--r--src/opts/SkColor_opts_SSE2.h101
1 files changed, 80 insertions, 21 deletions
diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h
index feb1d98f8d..a3db880598 100644
--- a/src/opts/SkColor_opts_SSE2.h
+++ b/src/opts/SkColor_opts_SSE2.h
@@ -80,6 +80,42 @@ static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const unsigned scale) {
return _mm_or_si128(rb, ag);
}
+// SSE2 counterpart of the portable SkFastFourByteInterp256 in SkColorPriv.h; interpolates every 8-bit channel of src and dst by src_scale.
+static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {
+ // Computes dst + (((src - dst)*src_scale)>>8) per 8-bit channel. NOTE(review): src_scale is presumably in [0, 256] -- confirm against callers.
+ const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+ // Unpack the 16x8-bit source and destination into 2 8x16-bit splayed halves each (even bytes via the mask, odd bytes via the shift).
+ __m128i src_rb = _mm_and_si128(mask, src);
+ __m128i src_ag = _mm_srli_epi16(src, 8);
+ __m128i dst_rb = _mm_and_si128(mask, dst);
+ __m128i dst_ag = _mm_srli_epi16(dst, 8);
+
+ // Compute scaled differences; _mm_mullo_epi16 keeps only the low 16 bits of each product.
+ __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
+ __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
+ __m128i s = _mm_set1_epi16(src_scale);
+ diff_rb = _mm_mullo_epi16(diff_rb, s);
+ diff_ag = _mm_mullo_epi16(diff_ag, s);
+
+ // Pack the differences back together: bits 8..15 of each 16-bit product become that channel's byte.
+ diff_rb = _mm_srli_epi16(diff_rb, 8);
+ diff_ag = _mm_andnot_si128(mask, diff_ag);
+ __m128i diff = _mm_or_si128(diff_rb, diff_ag);
+
+ // Add difference to destination; the per-byte add wraps modulo 256, so a negative difference byte subtracts.
+ return _mm_add_epi8(dst, diff);
+}
+
+// Portable version SkPMLerp is in SkColorPriv.h. scale weights src and (256 - scale) weights dst; the legacy path scales each side separately before adding, the default path defers to SkFastFourByteInterp256_SSE2.
+static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+ return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale));
+#else
+ return SkFastFourByteInterp256_SSE2(src, dst, scale);
+#endif
+}
+
static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
#if SK_A32_SHIFT == 24 // It's very common (universal?) that alpha is the top byte.
return _mm_srli_epi32(src, 24); // You'd hope the compiler would remove the left shift then,
@@ -213,33 +249,56 @@ static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
SkGetPackedA32_SSE2(src))));
}
-// Portable version is SkBlendARGB32 in SkColorPriv.h.
-static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
- const __m128i& aa) {
- __m128i src_scale = SkAlpha255To256_SSE2(aa);
- // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
- __m128i dst_scale = SkGetPackedA32_SSE2(src);
- dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
- dst_scale = _mm_srli_epi16(dst_scale, 8);
- dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
-
- __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
- return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
-}
-
// Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
const unsigned aa) {
unsigned alpha = SkAlpha255To256(aa);
- __m128i src_scale = _mm_set1_epi32(alpha);
- // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+ __m128i src_scale = _mm_set1_epi32(alpha);
+ // dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)), i.e. 256 - ((srcA * src_scale) >> 8)
+ __m128i dst_scale = SkGetPackedA32_SSE2(src);
+ dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
+ dst_scale = _mm_srli_epi16(dst_scale, 8);
+ dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
+
+ __m128i result = SkAlphaMulQ_SSE2(src, alpha);
+ return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+#else
+ __m128i src_scale = _mm_set1_epi16(alpha);
+ // dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), src_scale): p = 0xFFFF - srcA * src_scale; dst_scale = (p + (p >> 8)) >> 8
__m128i dst_scale = SkGetPackedA32_SSE2(src);
+ // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale (the high-word products stay 0).
dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
- dst_scale = _mm_srli_epi16(dst_scale, 8);
- dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
-
- __m128i result = SkAlphaMulQ_SSE2(src, alpha);
- return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+ dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
+ dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
+ dst_scale = _mm_srli_epi32(dst_scale, 8);
+ // Duplicate scales into 2x16-bit pattern per pixel: the low word of each 32-bit lane is copied into both of its 16-bit words.
+ dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+ dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+ // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves (even bytes via the mask, odd bytes via the shift).
+ __m128i src_rb = _mm_and_si128(mask, src);
+ __m128i src_ag = _mm_srli_epi16(src, 8);
+ __m128i dst_rb = _mm_and_si128(mask, dst);
+ __m128i dst_ag = _mm_srli_epi16(dst, 8);
+
+ // Scale them: source channels by src_scale, destination channels by dst_scale, leaving the 16-bit products unshifted.
+ src_rb = _mm_mullo_epi16(src_rb, src_scale);
+ src_ag = _mm_mullo_epi16(src_ag, src_scale);
+ dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
+ dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
+
+ // Add the scaled source and destination before any shift, so the low-order bits of both products contribute to the sum.
+ dst_rb = _mm_add_epi16(src_rb, dst_rb);
+ dst_ag = _mm_add_epi16(src_ag, dst_ag);
+
+ // Unsplay the halves back together, keeping the high byte of each 16-bit sum (i.e. sum >> 8).
+ dst_rb = _mm_srli_epi16(dst_rb, 8);
+ dst_ag = _mm_andnot_si128(mask, dst_ag);
+ return _mm_or_si128(dst_rb, dst_ag);
+#endif
}
#undef ASSERT_EQ