aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
authorGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2014-02-25 10:54:32 +0000
committerGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2014-02-25 10:54:32 +0000
commitfe68eb6a4081f60caf665ec632180e6d7c26a169 (patch)
tree5512538c33b8bb92ff50d4df2a51499af1a612c8 /src/opts
parent90667ba5eb88f5cb715f71add583a1c87efbe6a7 (diff)
ARM Skia NEON patches - 25 - S32A_D565_Opaque_Dither clean/bugfix/speed
BlitRow565: S32A_D565_Opaque_Dither: some improvements - Supports ARGB and ABGR - Less magic numbers - Reduced instruction count : 5-25% speedup - Fixed indentation, removed some commented and useless code Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG=skia: R=djsollen@google.com, mtklein@google.com Author: kevin.petit@arm.com Review URL: https://codereview.chromium.org/177963003 git-svn-id: http://skia.googlecode.com/svn/trunk@13577 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkBlitRow_opts_arm_neon.cpp205
1 files changed, 92 insertions, 113 deletions
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 07570fac6a..67b42c9e26 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -970,9 +970,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
#define UNROLL 8
if (count >= UNROLL) {
- uint8x8_t dbase;
-#if defined(DEBUG_OPAQUE_DITHER)
+#if defined(DEBUG_OPAQUE_DITHER)
uint16_t tmpbuf[UNROLL];
int td[UNROLL];
int tdv[UNROLL];
@@ -983,6 +982,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
int noisy = 0;
#endif
+ uint8x8_t dbase;
const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
dbase = vld1_u8(dstart);
@@ -991,27 +991,27 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
uint16x8_t dst8, scale8, alpha8;
uint16x8_t dst_r, dst_g, dst_b;
-#if defined(DEBUG_OPAQUE_DITHER)
- /* calculate 8 elements worth into a temp buffer */
- {
- int my_y = y;
- int my_x = x;
- SkPMColor* my_src = (SkPMColor*)src;
- uint16_t* my_dst = dst;
- int i;
-
- DITHER_565_SCAN(my_y);
- for(i=0;i<UNROLL;i++) {
+#if defined(DEBUG_OPAQUE_DITHER)
+ // calculate 8 elements worth into a temp buffer
+ {
+ int my_y = y;
+ int my_x = x;
+ SkPMColor* my_src = (SkPMColor*)src;
+ uint16_t* my_dst = dst;
+ int i;
+
+ DITHER_565_SCAN(my_y);
+ for(i = 0; i < UNROLL; i++) {
SkPMColor c = *my_src++;
SkPMColorAssert(c);
if (c) {
unsigned a = SkGetPackedA32(c);
int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
- tdv[i] = DITHER_VALUE(my_x);
- ta[i] = a;
- tap[i] = SkAlpha255To256(a);
- td[i] = d;
+ tdv[i] = DITHER_VALUE(my_x);
+ ta[i] = a;
+ tap[i] = SkAlpha255To256(a);
+ td[i] = d;
unsigned sr = SkGetPackedR32(c);
unsigned sg = SkGetPackedG32(c);
@@ -1025,147 +1025,126 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
// now src and dst expanded are in g:11 r:10 x:1 b:10
tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- td[i] = d;
-
+ td[i] = d;
} else {
- tmpbuf[i] = *my_dst;
- ta[i] = tdv[i] = td[i] = 0xbeef;
- }
- in_dst[i] = *my_dst;
+ tmpbuf[i] = *my_dst;
+ ta[i] = tdv[i] = td[i] = 0xbeef;
+ }
+ in_dst[i] = *my_dst;
my_dst += 1;
DITHER_INC_X(my_x);
- }
- }
+ }
+ }
#endif
- /* source is in ABGR */
+
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
register uint8x8_t d2 asm("d2");
register uint8x8_t d3 asm("d3");
- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
- : "r" (src)
- );
+ asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
+ :
+ );
+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
+ sr = d2; sg = d1; sb = d0; sa = d3;
+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
sr = d0; sg = d1; sb = d2; sa = d3;
+#endif
}
- /* calculate 'd', which will be 0..7 */
- /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
-#if defined(SK_BUILD_FOR_ANDROID)
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
-#else
- alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
-#endif
- alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
- d = vshrn_n_u16(alpha8, 8); /* narrowing too */
+ /* calculate 'd', which will be 0..7
+ * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
+ */
+ alpha8 = vmovl_u8(dbase);
+ alpha8 = vmlal_u8(alpha8, sa, dbase);
+ d = vshrn_n_u16(alpha8, 8); // narrowing too
- /* sr = sr - (sr>>5) + d */
+ // sr = sr - (sr>>5) + d
/* watching for 8-bit overflow. d is 0..7; risky range of
* sr is >248; and then (sr>>5) is 7 so it offsets 'd';
- * safe as long as we do ((sr-sr>>5) + d) */
+ * safe as long as we do ((sr-sr>>5) + d)
+ */
sr = vsub_u8(sr, vshr_n_u8(sr, 5));
sr = vadd_u8(sr, d);
- /* sb = sb - (sb>>5) + d */
+ // sb = sb - (sb>>5) + d
sb = vsub_u8(sb, vshr_n_u8(sb, 5));
sb = vadd_u8(sb, d);
- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+ // sg = sg - (sg>>6) + d>>1; similar logic for overflows
sg = vsub_u8(sg, vshr_n_u8(sg, 6));
sg = vadd_u8(sg, vshr_n_u8(d,1));
- /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
+ // need to pick up 8 dst's -- at 16 bits each, 128 bits
dst8 = vld1q_u16(dst);
- dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
- dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
- dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
-
- /* blend */
-#if 1
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- /* originally 255-sa + 1 */
+ dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
+ dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
+ dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
+
+ // blend
scale8 = vsubw_u8(vdupq_n_u16(256), sa);
-#else
- scale8 = vsubw_u8(vdupq_n_u16(255), sa);
- scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
-#endif
-#if 1
- /* combine the addq and mul, save 3 insns */
+ // combine the addq and mul, save 3 insns
scale8 = vshrq_n_u16(scale8, 3);
dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
-#else
- /* known correct, but +3 insns over above */
- scale8 = vshrq_n_u16(scale8, 3);
- dst_b = vmulq_u16(dst_b, scale8);
- dst_g = vmulq_u16(dst_g, scale8);
- dst_r = vmulq_u16(dst_r, scale8);
-
- /* combine */
- /* NB: vshll widens, need to preserve those bits */
- dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
- dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
- dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
-#endif
- /* repack to store */
- dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
+ // repack to store
+ dst8 = vshrq_n_u16(dst_b, 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
vst1q_u16(dst, dst8);
-#if defined(DEBUG_OPAQUE_DITHER)
- /* verify my 8 elements match the temp buffer */
- {
- int i, bad=0;
- static int invocation;
-
- for (i=0;i<UNROLL;i++)
- if (tmpbuf[i] != dst[i]) bad=1;
- if (bad) {
- SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
- invocation, offset);
- SkDebugf(" alpha 0x%x\n", alpha);
- for (i=0;i<UNROLL;i++)
- SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
- i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
- dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
-
- showme16("alpha8", &alpha8, sizeof(alpha8));
- showme16("scale8", &scale8, sizeof(scale8));
- showme8("d", &d, sizeof(d));
- showme16("dst8", &dst8, sizeof(dst8));
- showme16("dst_b", &dst_b, sizeof(dst_b));
- showme16("dst_g", &dst_g, sizeof(dst_g));
- showme16("dst_r", &dst_r, sizeof(dst_r));
- showme8("sb", &sb, sizeof(sb));
- showme8("sg", &sg, sizeof(sg));
- showme8("sr", &sr, sizeof(sr));
-
- /* cop out */
- return;
- }
- offset += UNROLL;
- invocation++;
- }
-#endif
+#if defined(DEBUG_OPAQUE_DITHER)
+ // verify my 8 elements match the temp buffer
+ {
+ int i, bad=0;
+ static int invocation;
- dst += UNROLL;
- src += UNROLL;
+ for (i = 0; i < UNROLL; i++) {
+ if (tmpbuf[i] != dst[i]) {
+ bad=1;
+ }
+ }
+ if (bad) {
+ SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
+ invocation, offset);
+ SkDebugf(" alpha 0x%x\n", alpha);
+ for (i = 0; i < UNROLL; i++)
+ SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
+ i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
+ in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
+
+ showme16("alpha8", &alpha8, sizeof(alpha8));
+ showme16("scale8", &scale8, sizeof(scale8));
+ showme8("d", &d, sizeof(d));
+ showme16("dst8", &dst8, sizeof(dst8));
+ showme16("dst_b", &dst_b, sizeof(dst_b));
+ showme16("dst_g", &dst_g, sizeof(dst_g));
+ showme16("dst_r", &dst_r, sizeof(dst_r));
+ showme8("sb", &sb, sizeof(sb));
+ showme8("sg", &sg, sizeof(sg));
+ showme8("sr", &sr, sizeof(sr));
+
+ return;
+ }
+ offset += UNROLL;
+ invocation++;
+ }
+#endif
+ dst += UNROLL;
count -= UNROLL;
- /* skip x += UNROLL, since it's unchanged mod-4 */
+ // skip x += UNROLL, since it's unchanged mod-4
} while (count >= UNROLL);
}
#undef UNROLL
- /* residuals */
+ // residuals
if (count > 0) {
DITHER_565_SCAN(y);
do {