aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
author henrik.smiding <henrik.smiding@intel.com> 2015-03-20 09:20:46 -0700
committer Commit bot <commit-bot@chromium.org> 2015-03-20 09:20:46 -0700
commit 70840cbd898df67f603987213164c798415d76bf (patch)
tree d6119805a17a04ace94a75748204621f090b4fe2 /src/opts
parent 86ad8d643624a55b02e529100bbe4e2940115fa1 (diff)
Replace SSE optimization of Color32A_D565
Adds an SSE2 version of the Color32A_D565 function, to replace the existing SSE4 version. Also does some minor cleanup.
Performance improvement in the following Skia benchmarks.
Measured on Atom Silvermont: Xfermode_SrcOver - x3; luma_colorfilter_large - x4.6; luma_colorfilter_small - x2; tablebench - ~15%; chart_bw - ~10%.
Measured on Corei7 Haswell: luma_colorfilter_large running SSE2 - x2; luma_colorfilter_large running SSE4 - x2.3.
Also improves performance in WPS Office application and 2D subtest of 0xbenchmark on Android.
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
Review URL: https://codereview.chromium.org/923523002
Diffstat (limited to 'src/opts')
-rw-r--r-- src/opts/SkBitmapProcState_opts_SSE2.h 2
-rw-r--r-- src/opts/SkBlitRow_opts_SSE2.cpp 69
-rw-r--r-- src/opts/SkBlitRow_opts_SSE2.h 7
-rw-r--r-- src/opts/SkBlitRow_opts_SSE4.cpp 79
-rw-r--r-- src/opts/SkBlitRow_opts_SSE4.h 3
-rw-r--r-- src/opts/opts_check_x86.cpp 13
6 files changed, 85 insertions, 88 deletions
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 82c5cc8d6e..82bf2cdae1 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -16,8 +16,6 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
const uint32_t* xy,
int count, uint32_t* colors);
-void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
- SkPMColor color);
void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
int count, int x, int y);
void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index e830c5fa06..80fdeecbcb 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -289,6 +289,75 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
}
}
+void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
+ SkASSERT(count > 0);
+
+ uint32_t src_expand = (SkGetPackedG32(src) << 24) |
+ (SkGetPackedR32(src) << 13) |
+ (SkGetPackedB32(src) << 2);
+ unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
+
+ // Check if we have enough pixels to run SIMD
+ if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
+ __m128i* dst_wide;
+ const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
+ const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
+ const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
+ const __m128i scale_wide = _mm_set1_epi16(scale);
+ const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
+ const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
+
+ // Align dst to an even 16 byte address (0-7 pixels)
+ while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
+ *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+ dst += 1;
+ count--;
+ }
+
+ dst_wide = reinterpret_cast<__m128i*>(dst);
+ do {
+ // Load eight RGB565 pixels
+ __m128i pixels = _mm_load_si128(dst_wide);
+
+ // Mask out sub-pixels
+ __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
+ __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
+ pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
+ __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
+
+ // Scale with alpha
+ pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
+ pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
+ pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
+
+ // Add src_X_wide and shift down again
+ pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
+ pixel_R = _mm_srli_epi16(pixel_R, 5);
+ pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
+ pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
+ pixel_B = _mm_srli_epi16(pixel_B, 5);
+
+ // Combine into RGB565 and store
+ pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
+ pixel_G = _mm_and_si128(pixel_G, mask_green);
+ pixels = _mm_or_si128(pixel_R, pixel_G);
+ pixels = _mm_or_si128(pixels, pixel_B);
+ _mm_store_si128(dst_wide, pixels);
+ count -= 8;
+ dst_wide++;
+ } while (count >= 8);
+
+ dst = reinterpret_cast<uint16_t*>(dst_wide);
+ }
+
+ // Small loop to handle remaining pixels.
+ while (count > 0) {
+ *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+ dst += 1;
+ count--;
+ }
+}
+
void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
size_t maskRB, SkColor origColor,
int width, int height) {
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 29fd96e5e9..bb6cece478 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -21,6 +21,12 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
+
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+ SkPMColor color);
+void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x,
+ int y);
+
void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask,
size_t maskRB, SkColor color,
int width, int height);
@@ -42,5 +48,4 @@ void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int x, int y);
-
#endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
index f4273d27b4..3649d175ef 100644
--- a/src/opts/SkBlitRow_opts_SSE4.cpp
+++ b/src/opts/SkBlitRow_opts_SSE4.cpp
@@ -7,14 +7,9 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
sk_throw();
}
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
- sk_throw();
-}
-
#else
-#include <smmintrin.h> // SSE4.1 intrinsics
-
+#include <smmintrin.h> // SSE4.1 intrinsics
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
@@ -66,76 +61,4 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
}
}
-static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {
- uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;
- return SkCompact_rgb_16((src_expand + dst_expand) >> 5);
-}
-
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
- SkASSERT(count > 0);
-
- uint32_t src_expand = (SkGetPackedG32(src) << 24) |
- (SkGetPackedR32(src) << 13) |
- (SkGetPackedB32(src) << 2);
- unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
-
- // Check if we have enough pixels to run SIMD
- if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
- __m128i* dst_wide;
- const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
- const __m128i scale_wide = _mm_set1_epi32(scale);
- const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
- SK_B16_MASK_IN_PLACE |
- (SK_G16_MASK_IN_PLACE << 16));
-
- // Align dst to an even 16 byte address (0-7 pixels)
- while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
- *dst = Color32A_D565_1x(*dst, scale, src_expand);
- dst += 1;
- count--;
- }
-
- dst_wide = reinterpret_cast<__m128i*>(dst);
- do {
- // Load 8 RGB565 pixels
- __m128i pixels = _mm_load_si128(dst_wide);
-
- // Duplicate and mask
- __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
- pixels_high = _mm_and_si128(mask_green, pixels_high);
- pixels = _mm_unpacklo_epi16(pixels, pixels);
- pixels = _mm_and_si128(mask_green, pixels);
-
- // Scale with alpha
- pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
- pixels = _mm_mullo_epi32(pixels, scale_wide);
-
- // Add src_expand_wide and shift down again
- pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
- pixels_high = _mm_srli_epi32(pixels_high, 5);
- pixels = _mm_add_epi32(pixels, src_expand_wide);
- pixels = _mm_srli_epi32(pixels, 5);
-
- // Mask
- pixels_high = _mm_and_si128(mask_green, pixels_high);
- pixels = _mm_and_si128(mask_green, pixels);
-
- // Combine into RGB565 and store
- pixels = _mm_hadd_epi16(pixels, pixels_high);
- _mm_store_si128(dst_wide, pixels);
- count -= 8;
- dst_wide++;
- } while (count >= 8);
-
- dst = reinterpret_cast<uint16_t*>(dst_wide);
- }
-
- // Small loop to handle remaining pixels.
- while (count > 0) {
- *dst = Color32A_D565_1x(*dst, scale, src_expand);
- dst += 1;
- count--;
- }
-}
-
#endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h
index 6a572161e1..577ace6f8f 100644
--- a/src/opts/SkBlitRow_opts_SSE4.h
+++ b/src/opts/SkBlitRow_opts_SSE4.h
@@ -14,8 +14,5 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
const SkPMColor* SK_RESTRICT,
int count,
U8CPU alpha);
-
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y);
-
#endif
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 6b9758c123..7314f7dcf8 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -215,14 +215,19 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
}
}
-static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE4[] = {
- Color32A_D565_SSE4, // Color32A_D565,
+static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = {
+ Color32A_D565_SSE2, // Color32A_D565,
NULL, // Color32A_D565_Dither
};
SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
- if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
- return platform_565_colorprocs_SSE4[flags];
+/* If you're thinking about writing an SSE4 version of this, do check it's
+ * actually faster on Atom. Our original SSE4 version was slower than this
+ * SSE2 version on Silvermont, and only marginally faster on a Core i7,
+ * mainly due to the MULLD timings.
+ */
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+ return platform_565_colorprocs_SSE2[flags];
} else {
return NULL;
}