diff options
author | 2014-04-09 09:46:31 +0000 | |
---|---|---|
committer | 2014-04-09 09:46:31 +0000 | |
commit | 95c2e5532b094add82b007bdfcd4c64050b6b366 (patch) | |
tree | ee2cb13dd0eb34fe864b80fbe732b19b7c526300 /src | |
parent | e1d94437585dad1c195d7cf095f8a5a8219d196a (diff) |
ARM Skia NEON patches - 22 - S32_D565_Blend
BlitRow565: new NEON version of S32_D565_Blend
This new implementation brings a good speedup in most cases and
gives exact results (removes one mismatch in gm).
Here are the benchmark results (speedup vs. existing S32A_D565_Blend):
+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
| 1 | -26,7% | -27,5% |
+-------+-----------+------------+
| 2 | 0% | +53% |
+-------+-----------+------------+
| 4 | +38,3% | +26,5% |
+-------+-----------+------------+
| 8 | +10,9% | -4,5% |
+-------+-----------+------------+
| 16 | +18,2% | +1,6% |
+-------+-----------+------------+
| 64 | +22,3% | +8,75% |
+-------+-----------+------------+
| 256 | +12,3% | +11,2% |
+-------+-----------+------------+
| 1024 | +79,2% | +10,9% |
+-------+-----------+------------+
Signed-off-by: Kévin PETIT <kevin.petit@arm.com>
BUG=skia:
R=djsollen@google.com, mtklein@google.com
Author: kevin.petit@arm.com
Review URL: https://codereview.chromium.org/181523002
git-svn-id: http://skia.googlecode.com/svn/trunk@14103 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 89 |
1 files changed, 85 insertions, 4 deletions
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 67b42c9e26..950e4f71d1 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -52,6 +52,90 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, }; } +void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 > alpha); + + uint16x8_t vmask_blue, vscale; + + // prepare constants + vscale = vdupq_n_u16(SkAlpha255To256(alpha)); + vmask_blue = vmovq_n_u16(0x1F); + + while (count >= 8) { + uint16x8_t vdst, vdst_r, vdst_g, vdst_b; + uint16x8_t vres_r, vres_g, vres_b; + uint8x8_t vsrc_r, vsrc_g, vsrc_b; + + // Load src + { + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm ( + "vld4.8 {d0-d3},[%[src]]!" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) + : + ); + vsrc_g = d1; +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) + vsrc_r = d2; vsrc_b = d0; +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) + vsrc_r = d0; vsrc_b = d2; +#endif + } + + // Load and unpack dst + vdst = vld1q_u16(dst); + vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes + vdst_b = vandq_u16(vdst, vmask_blue); // extract blue + vdst_r = vshrq_n_u16(vdst, 6+5); // extract red + vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green + + // Shift src to 565 + vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range + vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range + vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range + + // Scale src - dst + vres_r = vmovl_u8(vsrc_r) - vdst_r; + vres_g = vmovl_u8(vsrc_g) - vdst_g; + vres_b = vmovl_u8(vsrc_b) - vdst_b; + + vres_r = vshrq_n_u16(vres_r * vscale, 8); + vres_g = vshrq_n_u16(vres_g * vscale, 8); + vres_b = vshrq_n_u16(vres_b * vscale, 8); + + vres_r += vdst_r; + vres_g += vdst_g; + vres_b += vdst_b; + + // Combine + vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue + vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue + + // Store + vst1q_u16(dst, vres_b); + dst += 8; + count -= 8; + } + if (count > 0) { + int scale = SkAlpha255To256(alpha); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + uint16_t d = *dst; + *dst++ = SkPackRGB16( + SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), + SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), + SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); + } while (--count != 0); + } +} + void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { @@ -1385,11 +1469,8 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { // no dither - // NOTE: For the S32_D565_Blend function below, we don't have a special - // version that assumes that each source pixel is opaque. But our - // S32A is still faster than the default, so use it. S32_D565_Opaque_neon, - S32A_D565_Blend_neon, // really S32_D565_Blend + S32_D565_Blend_neon, S32A_D565_Opaque_neon, S32A_D565_Blend_neon, |