diff options
Diffstat (limited to 'src/core/SkBlitter_RGB16.cpp')
-rw-r--r-- | src/core/SkBlitter_RGB16.cpp | 100 |
1 files changed, 45 insertions, 55 deletions
diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp index cb5723254d..256cbc6936 100644 --- a/src/core/SkBlitter_RGB16.cpp +++ b/src/core/SkBlitter_RGB16.cpp @@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, do { int w = width; if (w >= UNROLL) { - uint32x4_t color; /* can use same one */ - uint32x4_t dev_lo, dev_hi; - uint32x4_t t1; - uint32x4_t wn1, wn2; - uint16x4_t odev_lo, odev_hi; - uint16x4_t alpha_lo, alpha_hi; - uint16x8_t alpha_full; - + uint32x4_t color, dev_lo, dev_hi; + uint32x4_t wn1, wn2, tmp; + uint32x4_t vmask_g16, vmask_ng16; + uint16x8_t valpha, vdev; + uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi; + + // prepare constants + vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE); + vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE); color = vdupq_n_u32(expanded32); do { - /* alpha is 8x8, widen and split to get pair of 16x4's */ - alpha_full = vmovl_u8(vld1_u8(alpha)); - alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7)); - alpha_full = vshrq_n_u16(alpha_full, 3); - alpha_lo = vget_low_u16(alpha_full); - alpha_hi = vget_high_u16(alpha_full); - - dev_lo = vmovl_u16(vld1_u16(device)); - dev_hi = vmovl_u16(vld1_u16(device+4)); - - /* unpack in 32 bits */ - dev_lo = vorrq_u32( - vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), - vshlq_n_u32(vandq_u32(dev_lo, - vdupq_n_u32(0x000007E0)), - 16) - ); - dev_hi = vorrq_u32( - vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), - vshlq_n_u32(vandq_u32(dev_hi, - vdupq_n_u32(0x000007E0)), - 16) - ); - - /* blend the two */ - t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo)); - t1 = vshrq_n_u32(t1, 5); - dev_lo = vaddq_u32(dev_lo, t1); - - t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi)); - t1 = vshrq_n_u32(t1, 5); - dev_hi = vaddq_u32(dev_hi, t1); - - /* re-compact and store */ - wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), - wn2 = vshrq_n_u32(dev_lo, 16); - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); - odev_lo = vmovn_u32(vorrq_u32(wn1, wn2)); - - wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), - wn2 = vshrq_n_u32(dev_hi, 16); - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); - odev_hi = vmovn_u32(vorrq_u32(wn1, wn2)); - - vst1_u16(device, odev_lo); - vst1_u16(device+4, odev_hi); + // alpha is 8x8, widen and split to get a pair of 16x4 + valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha)); + valpha = vshrq_n_u16(valpha, 3); + valpha_lo = vget_low_u16(valpha); + valpha_hi = vget_high_u16(valpha); + + // load pixels + vdev = vld1q_u16(device); + dev_lo = vmovl_u16(vget_low_u16(vdev)); + dev_hi = vmovl_u16(vget_high_u16(vdev)); + + // unpack them in 32 bits + dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16); + dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16); + + // blend with color + tmp = (color - dev_lo) * vmovl_u16(valpha_lo); + tmp = vshrq_n_u32(tmp, 5); + dev_lo += tmp; + + tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi)); + tmp = vshrq_n_u32(tmp, 5); + dev_hi += tmp; + + // re-compact + wn1 = dev_lo & vmask_ng16; + wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16; + odev_lo = vmovn_u32(wn1 | wn2); + + wn1 = dev_hi & vmask_ng16; + wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16; + odev_hi = vmovn_u32(wn1 | wn2); + + // store + vst1q_u16(device, vcombine_u16(odev_lo, odev_hi)); device += UNROLL; alpha += UNROLL; @@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, } while (w >= UNROLL); } - /* residuals (which is everything if we have no neon) */ + // residuals while (w > 0) { *device = blend_compact(expanded32, SkExpand_rgb_16(*device), SkAlpha255To256(*alpha++) >> 3); |