aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts/SkBlitMask_opts_arm_neon.cpp
diff options
context:
space:
mode:
authorGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-11-27 17:08:36 +0000
committerGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-11-27 17:08:36 +0000
commitdbe7f52412f55561dcc3a51fa3df2779c9a368bf (patch)
tree857a4d40cf4b032f94de79870863b297f1196194 /src/opts/SkBlitMask_opts_arm_neon.cpp
parenta9a4b04a98f9990e4e11b164b2eed37416768f21 (diff)
ARM Skia NEON patches - 16/17 - Blitmask
Blitmask: NEON optimised version of the D32_A8 functions Here are the microbenchmark results I got for the D32_A8 functions: Cortex-A9: ========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -14% | -39,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -3% | -29,9% | -25% | +-------+--------+--------+--------+ | 4 | -11,3% | -22% | -14,5% | +-------+--------+--------+--------+ | 8 | +128% | +66,6% | +105% | +-------+--------+--------+--------+ | 16 | +159% | +102% | +149% | +-------+--------+--------+--------+ | 64 | +189% | +136% | +189% | +-------+--------+--------+--------+ | 256 | +126% | +102% | +149% | +-------+--------+--------+--------+ | 1024 | +67,5% | +81,4% | +123% | +-------+--------+--------+--------+ Cortex-A15: =========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -24% | -46,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -18,5% | -35,5% | -28% | +-------+--------+--------+--------+ | 4 | -5,2% | -17,5% | -15,5% | +-------+--------+--------+--------+ | 8 | +72% | +65,8% | +84,7% | +-------+--------+--------+--------+ | 16 | +168% | +117% | +149% | +-------+--------+--------+--------+ | 64 | +165% | +110% | +145% | +-------+--------+--------+--------+ | 256 | +106% | +99,6% | +141% | +-------+--------+--------+--------+ | 1024 | +93,7% | +94,7% | +130% | +-------+--------+--------+--------+ Blitmask: add NEON optimised PlatformBlitRowProcs16 Here are the microbenchmark results (speedup vs. C code): +-------+-----------------+-----------------+ | | Cortex-A9 | Cortex-A15 | | count +--------+--------+--------+--------+ | | Blend | Opaque | Blend | Opaque | +-------+--------+--------+--------+--------+ | 1 | -19,2% | -36,7% | -33,6% | -44,7% | +-------+--------+--------+--------+--------+ | 2 | -12,6% | -27,8% | -39% | -48% | +-------+--------+--------+--------+--------+ | 4 | -11,5% | -21,6% | -37,7% | -44,3% | +-------+--------+--------+--------+--------+ | 8 | +141% | +59,7% | +123% | +48,7% | +-------+--------+--------+--------+--------+ | 16 | +213% | +119% | +214% | +121% | +-------+--------+--------+--------+--------+ | 64 | +212% | +105% | +242% | +167% | +-------+--------+--------+--------+--------+ | 256 | +289% | +167% | +249% | +207% | +-------+--------+--------+--------+--------+ | 1024 | +273% | +169% | +146% | +220% | +-------+--------+--------+--------+--------+ Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com, mtklein@google.com, reed@google.com Author: kevin.petit.arm@gmail.com Review URL: https://codereview.chromium.org/23719002 git-svn-id: http://skia.googlecode.com/svn/trunk@12420 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts/SkBlitMask_opts_arm_neon.cpp')
-rw-r--r--src/opts/SkBlitMask_opts_arm_neon.cpp255
1 files changed, 255 insertions, 0 deletions
diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp
new file mode 100644
index 0000000000..7db6fcbfb1
--- /dev/null
+++ b/src/opts/SkBlitMask_opts_arm_neon.cpp
@@ -0,0 +1,255 @@
+
+#include "SkBlitMask.h"
+#include "SkColor_opts_neon.h"
+
+static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
+ const void* SK_RESTRICT maskPtr, size_t maskRB,
+ SkColor, int width, int height) {
+ SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
+ const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
+
+ maskRB -= width;
+ dstRB -= (width << 2);
+ do {
+ int w = width;
+ while (w >= 8) {
+ uint8x8_t vmask = vld1_u8(mask);
+ uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
+ uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
+
+ vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
+ vdevice.val[NEON_A] += vmask;
+
+ vst4_u8((uint8_t*)device, vdevice);
+
+ mask += 8;
+ device += 8;
+ w -= 8;
+ }
+ while (w-- > 0) {
+ unsigned aa = *mask++;
+ *device = (aa << SK_A32_SHIFT)
+ + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
+ device += 1;
+ };
+ device = (uint32_t*)((char*)device + dstRB);
+ mask += maskRB;
+ } while (--height != 0);
+}
+
+template <bool isColor>
+static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
+ const void* SK_RESTRICT maskPtr, size_t maskRB,
+ SkColor color, int width, int height) {
+ SkPMColor pmc = SkPreMultiplyColor(color);
+ SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
+ const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
+ uint8x8x4_t vpmc;
+
+ maskRB -= width;
+ dstRB -= (width << 2);
+
+ if (width >= 8) {
+ vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
+ vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
+ vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
+ vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
+ }
+ do {
+ int w = width;
+ while (w >= 8) {
+ uint8x8_t vmask = vld1_u8(mask);
+ uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
+ if (isColor) {
+ vscale = vsubw_u8(vdupq_n_u16(256),
+ SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
+ } else {
+ vscale = vsubw_u8(vdupq_n_u16(256), vmask);
+ }
+ uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
+
+ vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
+ + SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
+ vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
+ + SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
+ vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
+ + SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
+ vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
+ + SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
+
+ vst4_u8((uint8_t*)device, vdev);
+
+ mask += 8;
+ device += 8;
+ w -= 8;
+ }
+
+ while (w--) {
+ unsigned aa = *mask++;
+ if (isColor) {
+ *device = SkBlendARGB32(pmc, *device, aa);
+ } else {
+ *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
+ + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
+ }
+ device += 1;
+ };
+
+ device = (uint32_t*)((char*)device + dstRB);
+ mask += maskRB;
+
+ } while (--height != 0);
+}
+
+static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
+ const void* SK_RESTRICT maskPtr, size_t maskRB,
+ SkColor color, int width, int height) {
+ D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
+}
+
+static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
+ const void* SK_RESTRICT maskPtr, size_t maskRB,
+ SkColor color, int width, int height) {
+ D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
+}
+
+SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
+ if (SK_ColorBLACK == color) {
+ return D32_A8_Black_neon;
+ } else if (0xFF == SkColorGetA(color)) {
+ return D32_A8_Opaque_neon;
+ } else {
+ return D32_A8_Color_neon;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
+ SkColor color, int width,
+ SkPMColor opaqueDst) {
+ int colR = SkColorGetR(color);
+ int colG = SkColorGetG(color);
+ int colB = SkColorGetB(color);
+
+ uint8x8_t vcolR, vcolG, vcolB;
+ uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
+
+ if (width >= 8) {
+ vcolR = vdup_n_u8(colR);
+ vcolG = vdup_n_u8(colG);
+ vcolB = vdup_n_u8(colB);
+ vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
+ vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
+ vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
+ vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
+ }
+
+ while (width >= 8) {
+ uint8x8x4_t vdst;
+ uint16x8_t vmask;
+ uint16x8_t vmaskR, vmaskG, vmaskB;
+ uint8x8_t vsel_trans, vsel_opq;
+
+ vdst = vld4_u8((uint8_t*)dst);
+ vmask = vld1q_u16(src);
+
+ // Prepare compare masks
+ vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
+ vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
+
+ // Get all the color masks on 5 bits
+ vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
+ vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
+ SK_B16_BITS + SK_R16_BITS + 1);
+ vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
+
+ // Upscale to 0..32
+ vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
+ vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
+ vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
+
+ vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
+ vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
+
+ vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
+ vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
+ vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
+
+ vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
+ vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
+ vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
+
+ vst4_u8((uint8_t*)dst, vdst);
+
+ dst += 8;
+ src += 8;
+ width -= 8;
+ }
+
+ // Leftovers
+ for (int i = 0; i < width; i++) {
+ dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
+ opaqueDst);
+ }
+}
+
+void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
+ SkColor color, int width, SkPMColor) {
+ int colA = SkColorGetA(color);
+ int colR = SkColorGetR(color);
+ int colG = SkColorGetG(color);
+ int colB = SkColorGetB(color);
+
+ colA = SkAlpha255To256(colA);
+
+ uint8x8_t vcolR, vcolG, vcolB;
+ uint16x8_t vcolA;
+
+ if (width >= 8) {
+ vcolA = vdupq_n_u16(colA);
+ vcolR = vdup_n_u8(colR);
+ vcolG = vdup_n_u8(colG);
+ vcolB = vdup_n_u8(colB);
+ }
+
+ while (width >= 8) {
+ uint8x8x4_t vdst;
+ uint16x8_t vmask;
+ uint16x8_t vmaskR, vmaskG, vmaskB;
+
+ vdst = vld4_u8((uint8_t*)dst);
+ vmask = vld1q_u16(src);
+
+ // Get all the color masks on 5 bits
+ vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
+ vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
+ SK_B16_BITS + SK_R16_BITS + 1);
+ vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
+
+ // Upscale to 0..32
+ vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
+ vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
+ vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
+
+ vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
+ vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
+ vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
+
+ vdst.val[NEON_A] = vdup_n_u8(0xFF);
+ vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
+ vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
+ vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
+
+ vst4_u8((uint8_t*)dst, vdst);
+
+ dst += 8;
+ src += 8;
+ width -= 8;
+ }
+
+ for (int i = 0; i < width; i++) {
+ dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
+ }
+}
+