diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-08-08 10:51:45 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-08-08 10:51:45 +0000 |
commit | 641a2491964f7e5921fad6649be08175e03b0434 (patch) | |
tree | 89d0b39e0dfb2f48c68c0da7ebce7c5fe421e145 /src | |
parent | d55e357a8e276d1f1cd6231d57dda339a2beb76f (diff) |
ARM Skia NEON patches - 11 - Blitter_RGB16
Blitter_RGB16: fixes and improvements
- fix alpha calculation: it was still using the old version of
SkAlpha255To256. 11 more tests pass in gm.
- clean a lot the code: the existing code was "a bit" messy with
a lot of duplicated hardcoded constants, got rid of all this.
- improve speed a little: part of it as a side-effect of the change
in the way alpha is calculated but also by grouping loads and stores.
One "issue" was present and still remains: the NEON code doesn't give
the same result as the black blitter on black. It accounts for dozens
of mismatches in gm. Is this considered "not too bad"? Would you be
interested in a NEON version of the black blitter? The current
comments seem to indicate that the black blitter is here only to give
a performance boost when NEON is not presents so I didn't write a NEON
version.
BUG=
R=djsollen@google.com, tomhudson@google.com, reed@google.com
Author: kevin.petit.arm@gmail.com
Review URL: https://chromiumcodereview.appspot.com/18666005
git-svn-id: http://skia.googlecode.com/svn/trunk@10635 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/core/SkBlitter_RGB16.cpp | 100 |
1 files changed, 45 insertions, 55 deletions
diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp index cb5723254d..256cbc6936 100644 --- a/src/core/SkBlitter_RGB16.cpp +++ b/src/core/SkBlitter_RGB16.cpp @@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, do { int w = width; if (w >= UNROLL) { - uint32x4_t color; /* can use same one */ - uint32x4_t dev_lo, dev_hi; - uint32x4_t t1; - uint32x4_t wn1, wn2; - uint16x4_t odev_lo, odev_hi; - uint16x4_t alpha_lo, alpha_hi; - uint16x8_t alpha_full; - + uint32x4_t color, dev_lo, dev_hi; + uint32x4_t wn1, wn2, tmp; + uint32x4_t vmask_g16, vmask_ng16; + uint16x8_t valpha, vdev; + uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi; + + // prepare constants + vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE); + vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE); color = vdupq_n_u32(expanded32); do { - /* alpha is 8x8, widen and split to get pair of 16x4's */ - alpha_full = vmovl_u8(vld1_u8(alpha)); - alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7)); - alpha_full = vshrq_n_u16(alpha_full, 3); - alpha_lo = vget_low_u16(alpha_full); - alpha_hi = vget_high_u16(alpha_full); - - dev_lo = vmovl_u16(vld1_u16(device)); - dev_hi = vmovl_u16(vld1_u16(device+4)); - - /* unpack in 32 bits */ - dev_lo = vorrq_u32( - vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), - vshlq_n_u32(vandq_u32(dev_lo, - vdupq_n_u32(0x000007E0)), - 16) - ); - dev_hi = vorrq_u32( - vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), - vshlq_n_u32(vandq_u32(dev_hi, - vdupq_n_u32(0x000007E0)), - 16) - ); - - /* blend the two */ - t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo)); - t1 = vshrq_n_u32(t1, 5); - dev_lo = vaddq_u32(dev_lo, t1); - - t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi)); - t1 = vshrq_n_u32(t1, 5); - dev_hi = vaddq_u32(dev_hi, t1); - - /* re-compact and store */ - wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), - wn2 = vshrq_n_u32(dev_lo, 16); - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); - odev_lo = vmovn_u32(vorrq_u32(wn1, wn2)); - - wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), - wn2 = vshrq_n_u32(dev_hi, 16); - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); - odev_hi = vmovn_u32(vorrq_u32(wn1, wn2)); - - vst1_u16(device, odev_lo); - vst1_u16(device+4, odev_hi); + // alpha is 8x8, widen and split to get a pair of 16x4 + valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha)); + valpha = vshrq_n_u16(valpha, 3); + valpha_lo = vget_low_u16(valpha); + valpha_hi = vget_high_u16(valpha); + + // load pixels + vdev = vld1q_u16(device); + dev_lo = vmovl_u16(vget_low_u16(vdev)); + dev_hi = vmovl_u16(vget_high_u16(vdev)); + + // unpack them in 32 bits + dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16); + dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16); + + // blend with color + tmp = (color - dev_lo) * vmovl_u16(valpha_lo); + tmp = vshrq_n_u32(tmp, 5); + dev_lo += tmp; + + tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi)); + tmp = vshrq_n_u32(tmp, 5); + dev_hi += tmp; + + // re-compact + wn1 = dev_lo & vmask_ng16; + wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16; + odev_lo = vmovn_u32(wn1 | wn2); + + wn1 = dev_hi & vmask_ng16; + wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16; + odev_hi = vmovn_u32(wn1 | wn2); + + // store + vst1q_u16(device, vcombine_u16(odev_lo, odev_hi)); device += UNROLL; alpha += UNROLL; @@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, } while (w >= UNROLL); } - /* residuals (which is everything if we have no neon) */ + // residuals while (w > 0) { *device = blend_compact(expanded32, SkExpand_rgb_16(*device), SkAlpha255To256(*alpha++) >> 3); |