aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-08-01 14:23:23 +0000
committerGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-08-01 14:23:23 +0000
commit1fdc6774280ffc18dd7e1247e430931aa2f58790 (patch)
tree739c0693b40d856c39b6113059b2219591e7c5b0
parent251176ebe92417e956ed696e608939156a68585c (diff)
ARM Skia NEON patches - 14 - S32A_Blend
Blitrow32: S32A_Blend new NEON version

Adding a NEON version of S32A_Blend_BlitRow32. Here are the benchmark results
(speedup as a function of the pixel count per call):

+-------+--------------------------+--------------------------+
|       |      Speedup vs. C       |   Speedup vs. ARM asm    |
| count +------------+-------------+------------+-------------+
|       | Cortex A-9 | Cortex A-15 | Cortex A-9 | Cortex A-15 |
+-------+------------+-------------+------------+-------------+
|     1 |      +8.5% |      +18.5% |      +0.9% |       +2.9% |
+-------+------------+-------------+------------+-------------+
|     2 |     +65.6% |        +94% |     +70.3% |        +80% |
+-------+------------+-------------+------------+-------------+
|     4 |     +42.4% |      +87.8% |     +56.8% |      +84.4% |
+-------+------------+-------------+------------+-------------+
|     8 |       +30% |        +90% |     +49.9% |      +82.7% |
+-------+------------+-------------+------------+-------------+
|    16 |     +23.1% |      +95.4% |     +46.6% |      +87.6% |
+-------+------------+-------------+------------+-------------+
|    64 |     +23.1% |      +95.7% |     +46.1% |      +89.4% |
+-------+------------+-------------+------------+-------------+
|   256 |     +35.5% |       +122% |     +53.6% |      +99.2% |
+-------+------------+-------------+------------+-------------+
|  1024 |     +61.8% |       +101% |     +64.2% |      +91.2% |
+-------+------------+-------------+------------+-------------+

BUG=
R=djsollen@google.com
Author: kevin.petit.arm@gmail.com
Review URL: https://chromiumcodereview.appspot.com/18614010
git-svn-id: http://skia.googlecode.com/svn/trunk@10480 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r--src/opts/SkBlitRow_opts_arm_neon.cpp94
1 files changed, 93 insertions, 1 deletions
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 7868108378..200d85d1ea 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -760,6 +760,98 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
}
}
+void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha) {
+ // Blends count 32-bit pixels of src into dst in place; per byte: dst = ((dst * dst_scale) >> 8) + ((src * alpha256) >> 8), where dst_scale = 256 - ((src alpha * alpha256) >> 8).
+ SkASSERT(255 >= alpha); // alpha must not exceed 255
+
+ if (count <= 0) {
+ return; // nothing to blend
+ }
+
+ unsigned alpha256 = SkAlpha255To256(alpha); // scale applied to src; presumably maps 0..255 onto 0..256 so the >> 8 below acts as the divide - confirm against SkAlpha255To256
+
+ // First deal with odd counts: blend a single pixel so the remaining count is even for the two-pixel loop below
+ if (count & 1) {
+ uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; // zeroed so the lanes not loaded below hold defined values
+ uint16x8_t vdst_wide, vsrc_wide;
+ unsigned dst_scale;
+
+ // Load one 32-bit pixel from each buffer into lane 0
+ vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
+ vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
+
+ // Calc dst_scale = 256 - ((src alpha * alpha256) >> 8)
+ dst_scale = vget_lane_u8(vsrc, 3); // byte 3 is used as the src alpha; NOTE(review): assumes alpha is the 4th byte in memory - confirm against the SkPMColor layout
+ dst_scale *= alpha256;
+ dst_scale >>= 8;
+ dst_scale = 256 - dst_scale;
+
+ // Process src: widen bytes to 16 bits and multiply each by alpha256
+ vsrc_wide = vmovl_u8(vsrc);
+ vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
+
+ // Process dst: widen bytes to 16 bits and multiply each by dst_scale
+ vdst_wide = vmovl_u8(vdst);
+ vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
+
+ // Combine: narrow both products with >> 8, then add them as 8-bit lanes (plain add, no saturation)
+ vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+ vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // write back only the one blended pixel
+ dst++;
+ src++;
+ count--;
+ }
+
+ if (count) { // count is even from here on
+ uint8x8_t alpha_mask;
+ static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; // vtbl1_u8 indices: byte 3 -> lanes 0-3, byte 7 -> lanes 4-7
+ alpha_mask = vld1_u8(alpha_mask_setup);
+
+ do {
+
+ uint8x8_t vsrc, vdst, vres, vsrc_alphas;
+ uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
+
+ __builtin_prefetch(src+32); // prefetch 32 elements ahead of the current read position
+ __builtin_prefetch(dst+32);
+
+ // Load two pixels (8 bytes) from each buffer
+ vsrc = vreinterpret_u8_u32(vld1_u32(src));
+ vdst = vreinterpret_u8_u32(vld1_u32(dst));
+
+ // Prepare src_scale: alpha256 replicated in every 16-bit lane (same value on every iteration)
+ vsrc_scale = vdupq_n_u16(alpha256);
+
+ // Calc dst_scale = 256 - ((src alpha * alpha256) >> 8), separately for each of the two pixels
+ vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); // replicate each pixel's byte-3 value across that pixel's four lanes
+ vdst_scale = vmovl_u8(vsrc_alphas);
+ vdst_scale *= vsrc_scale;
+ vdst_scale = vshrq_n_u16(vdst_scale, 8);
+ vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
+
+ // Process src: widen bytes to 16 bits and multiply each by alpha256
+ vsrc_wide = vmovl_u8(vsrc);
+ vsrc_wide *= vsrc_scale;
+
+ // Process dst: widen bytes to 16 bits and multiply each by its pixel's dst_scale
+ vdst_wide = vmovl_u8(vdst);
+ vdst_wide *= vdst_scale;
+
+ // Combine: narrow both products with >> 8, then add them as 8-bit lanes (plain add, no saturation)
+ vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+
+ vst1_u32(dst, vreinterpret_u32_u8(vres)); // store both blended pixels
+
+ src += 2;
+ dst += 2;
+ count -= 2;
+ } while(count); // reaches zero because count is even here and drops by 2 per pass
+ }
+}
+
///////////////////////////////////////////////////////////////////////////////
#undef DEBUG_OPAQUE_DITHER
@@ -1273,5 +1365,5 @@ const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
#else
S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
#endif
- S32A_Blend_BlitRow32_arm // S32A_Blend
+ S32A_Blend_BlitRow32_neon // S32A_Blend
};