aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-22 20:26:53 +0000
committerGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-22 20:26:53 +0000
commit522aa8d4d61f30bddbaf996b2933d5f7b1f23880 (patch)
treef668bce2e778de7301d6e2225fb9807c16f0f642
parent8481ccc199ba3ed03acb19b1f341394db5bcb834 (diff)
fix S32A_D565_Blend_neon
git-svn-id: http://skia.googlecode.com/svn/trunk@405 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r--src/core/SkBitmapProcState_matrix_clamp.h110
-rw-r--r--src/core/SkBitmapProcState_matrix_repeat.h66
-rw-r--r--src/opts/SkBlitRow_opts_arm.cpp291
3 files changed, 312 insertions, 155 deletions
diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h
index 565af10538..22434680f4 100644
--- a/src/core/SkBitmapProcState_matrix_clamp.h
+++ b/src/core/SkBitmapProcState_matrix_clamp.h
@@ -299,8 +299,8 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
#endif
#if 1
- // 2009/9/30) crashes in ApiDemos - Views - Animation - 3D Transition
- // 2009/10/9: reworked, seems right now
+ // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition
+ // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn
/* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
* but we immediately discard the low 16 bits...
@@ -389,18 +389,18 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
register int16x4_t out_y asm("d17") = yhi;
asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst)
- );
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst)
+ );
}
{
register int16x4_t out_x asm("d18") = x2hi;
register int16x4_t out_y asm("d19") = y2hi;
asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst+8)
- );
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst+8)
+ );
}
/* XXX: gcc isn't interleaving these with the NEON ops
@@ -425,39 +425,38 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
/* for checking our NEON-produced results against vanilla code */
{
int bad = (-1);
- for (int i = 0; i < base_count; i++)
- {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+ for (int i = 0; i < base_count; i++) {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
- if (val != base_xy[i]) {
- bad = i;
- break;
- }
+ if (val != base_xy[i]) {
+ bad = i;
+ break;
+ }
}
if (bad >= 0) {
- SkDebugf ("clamp-nofilter-persp failed piece %d\n", bad);
- SkDebugf (" maxX %08x maxY %08x\n", maxX, maxY);
- bad -= (bad & 0x7); /* align */
- for (int i = bad; i < bad + 8; i++) {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
- SkDebugf ("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
- i, base_xy[i], val, base_srcXY[i * 2 + 0],
- base_srcXY[i * 2 + 1]);
- }
- SkDebugf ("---\n");
+ SkDebugf("clamp-nofilter-persp failed piece %d\n", bad);
+ SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY);
+ bad -= (bad & 0x7); /* align */
+ for (int i = bad; i < bad + 8; i++) {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+
+ SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
+ i, base_xy[i], val, base_srcXY[i * 2 + 0],
+ base_srcXY[i * 2 + 1]);
+ }
+ SkDebugf ("---\n");
}
if (end_xy != xy) {
- SkDebugf ("xy ended at %08x, should be %08x\n", xy, end_xy);
+ SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy);
}
if (end_srcXY != srcXY) {
- SkDebugf ("srcXY ended at %08x, should be %08x\n", srcXY,
- end_srcXY);
+ SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY,
+ end_srcXY);
}
}
#endif
@@ -621,10 +620,6 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_dx, wide_fx, wide_onex, wide_fx1;
int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1;
- /* need side-by-side registers for vst2.32 tricks */
- register int32x4_t wide_x asm("q7");
- register int32x4_t wide_y asm("q6");
-
#undef AFFINE_DEBUG
#if defined(AFFINE_DEBUG)
SkFixed fyp = fy;
@@ -649,6 +644,9 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
wide_oney = vdupq_n_s32(oneY);
while (count >= 4) {
+ int32x4_t wide_x;
+ int32x4_t wide_y;
+
/* do the X side, then the Y side, then interleave them */
/* original expands to:
@@ -698,10 +696,17 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
wide_y = vorrq_s32(wide_i, wide_fy1);
/* interleave as YXYXYXYX as part of the storing */
- asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
- :
- : "w" (wide_y), "w" (wide_x), "r" (xy)
- );
+ {
+ /* vst2.32 needs side-by-side registers */
+ register int32x4_t t_x asm("q1");
+ register int32x4_t t_y asm("q0");
+
+ t_x = wide_x; t_y = wide_y;
+ asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
+ :
+ : "w" (t_y), "w" (t_x), "r" (xy)
+ );
+ }
#if defined(AFFINE_DEBUG)
/* make sure we're good here -- check the 4 we just output */
@@ -773,22 +778,21 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_fy1;
int32x4_t wide_x, wide_y;
- /* need side-by-side regs for vld2/vst2 tricks */
- /* RBE: avoid low registers */
- register int32x4_t wide_first asm ("q6");
- register int32x4_t wide_second asm ("q7");
-
while (count >= 4) {
+ /* need side-by-side regs for vld2/vst2 tricks */
+ register int32x4_t wide_first asm ("q0");
+ register int32x4_t wide_second asm ("q1");
+
/* RBE: it's good, but:
* -- we spill a constant that could be easily regnerated
* [perhaps tweak gcc's NEON constant costs?]
*/
/* load src: x-y-x-y-x-y-x-y */
- asm ("vld2.32 {q6-q7},[%2] /* x=%q0 y=%q1 */"
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
: "=w" (wide_first), "=w" (wide_second)
: "r" (srcXY));
-
+ /* immediately get into vars gcc can move around if needed */
wide_x = wide_first;
wide_y = wide_second;
@@ -848,16 +852,16 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
/* switch them around; have to do it this way to get them
* in the proper registers to match our instruction */
- /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */
- wide_first = wide_y;
- wide_second = wide_x;
-
/* iteration bookkeeping, ahead of the asm() for scheduling */
srcXY += 2*4;
count -= 4;
/* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */
- asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
+ /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */
+ wide_first = wide_y;
+ wide_second = wide_x;
+
+ asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
:
: "w" (wide_first), "w" (wide_second), "r" (xy));
diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h
index 8d743a5ffc..e5309f8dea 100644
--- a/src/core/SkBitmapProcState_matrix_repeat.h
+++ b/src/core/SkBitmapProcState_matrix_repeat.h
@@ -95,8 +95,7 @@ static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
* but some processing of the 'fx' information
* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
*/
- if (count >= 8)
- {
+ if (count >= 8) {
/* SkFixed is 16.16 fixed point */
SkFixed dx2 = dx+dx;
SkFixed dx4 = dx2+dx2;
@@ -224,8 +223,7 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
* but some processing of the 'fx' information
* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
*/
- if (count >= 4)
- {
+ if (count >= 4) {
/* SkFixed is 16.16 fixed point */
SkFixed dx4 = dx*4;
SkFixed dy4 = dy*4;
@@ -257,11 +255,10 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
/* store & bump */
- do
- {
+ do {
int32x4_t xout;
- int32x4_t yout;
- int16x8_t hi16;
+ int32x4_t yout;
+ int16x8_t hi16;
/* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
/* mask to low 16 [would like to use uzp tricks) */
@@ -303,11 +300,11 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
ofx += odx; ofy += ody;
}
if (bad) {
- SkDebugf("repeat-nofilter-affine fails\n");
- SkDebugf("count %d myi %d\n", ocount, myi);
- SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
- bfx, bdx, bfy, bdy);
- SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
+ SkDebugf("repeat-nofilter-affine fails\n");
+ SkDebugf("count %d myi %d\n", ocount, myi);
+ SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
+ bfx, bdx, bfy, bdy);
+ SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
}
#endif
#endif
@@ -362,27 +359,36 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
int32_t *mysrc = (int32_t *) srcXY;
int16_t *mydst = (int16_t *) xy;
do {
- register int32x4_t x asm("q0");
- register int32x4_t y asm("q1");
- register int32x4_t x2 asm("q2");
- register int32x4_t y2 asm("q3");
-
- int16x8_t hi;
- int16x8_t hi2;
+ int32x4_t x, y, x2, y2;
+ int16x8_t hi, hi2;
/* read array of x,y,x,y,x,y */
/* vld2 does the de-interleaving for us */
- /* dependent on register assignments above */
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
- : "=w" (x), "=w" (y)
- : "r" (mysrc)
- );
+ /* isolate reg-bound scopes; gcc will minimize register
+ * motion if possible; this ensures that we don't lose
+ * a register across a debugging call because it happens
+ * to be bound into a call-clobbered register
+ */
+ {
+ register int32x4_t q0 asm("q0");
+ register int32x4_t q1 asm("q1");
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
+ : "=w" (q0), "=w" (q1)
+ : "r" (mysrc)
+ );
+ x = q0; y = q1;
+ }
/* offset == 256 bits == 32 bytes == 8 longs */
- asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
- : "=w" (x2), "=w" (y2)
- : "r" (mysrc+8)
- );
+ {
+ register int32x4_t q2 asm("q2");
+ register int32x4_t q3 asm("q3");
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
+ : "=w" (q2), "=w" (q3)
+ : "r" (mysrc+8)
+ );
+ x = q2; y = q3;
+ }
/* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
/* mask to low 16 [would like to use uzp tricks) */
@@ -405,7 +411,7 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
hi = vreinterpretq_s16_s32(y);
vst1q_s16(mydst, hi);
- /* and push second 8 entries out */
+ /* and likewise for the second 8 entries */
y2 = vsriq_n_s32(y2, x2, 16);
hi2 = vreinterpretq_s16_s32(y2);
vst1q_s16(mydst+8, hi2);
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 89b2e91c4d..506c4d407e 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -104,8 +104,9 @@ static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
"d30","d31"
);
- } else {
- // handle count < 8
+ }
+ else
+ { // handle count < 8
uint16_t* SK_RESTRICT keep_dst;
asm volatile (
@@ -427,20 +428,17 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
SkASSERT(255 == alpha);
if (count > 0) {
+
+ uint8x8_t alpha_mask;
+
+ static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
+ alpha_mask = vld1_u8(alpha_mask_setup);
+
/* do the NEON unrolled code */
#define UNROLL 4
while (count >= UNROLL) {
uint8x8_t src_raw, dst_raw, dst_final;
uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
- uint8x8_t alpha_mask;
-
- /* use vtbl, with src_raw as the table */
- /* expect gcc to hoist alpha_mask setup above loop */
- alpha_mask = vdup_n_u8(3);
- alpha_mask = vset_lane_u8(7, alpha_mask, 4);
- alpha_mask = vset_lane_u8(7, alpha_mask, 5);
- alpha_mask = vset_lane_u8(7, alpha_mask, 6);
- alpha_mask = vset_lane_u8(7, alpha_mask, 7);
/* get the source */
src_raw = vreinterpret_u8_u32(vld1_u32(src));
@@ -454,7 +452,6 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif
-#if 1
/* 1st and 2nd bits of the unrolling */
{
uint8x8_t dst_cooked;
@@ -464,12 +461,10 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
/* get the alphas spread out properly */
alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
- alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow);
- alpha_wide = vmovl_u8(alpha_narrow);
+ alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
- /* get the dest, spread it */
- dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
+ /* spread the dest */
dst_wide = vmovl_u8(dst_raw);
/* alpha mul the dest */
@@ -479,7 +474,6 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
/* sum -- ignoring any byte lane overflows */
dst_final = vadd_u8(src_raw, dst_cooked);
}
-#endif
#if UNROLL > 2
/* the 3rd and 4th bits of our unrolling */
@@ -490,11 +484,10 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
uint16x8_t alpha_wide;
alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
- alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow);
- alpha_wide = vmovl_u8(alpha_narrow);
+ alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
- /* get the dest, spread it */
+ /* spread the dest */
dst_wide = vmovl_u8(dst_raw_2);
/* alpha mul the dest */
@@ -544,7 +537,7 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
#endif
/* Neon version of S32_Blend_BlitRow32()
- * portable version is in core/SkBlitRow_D32.cpp
+ * portable version is in src/core/SkBlitRow_D32.cpp
*/
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
@@ -578,18 +571,17 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
/* get 64 bits of src, widen it, multiply by src_scale */
src_raw = vreinterpret_u8_u32(vld1_u32(src));
src_wide = vmovl_u8(src_raw);
- /* gcc hoists vdupq_n_u16(), better code than vmulq_n_u16() */
+ /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
/* ditto with dst */
dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
dst_wide = vmovl_u8(dst_raw);
- dst_wide = vmulq_u16 (dst_wide, vdupq_n_u16(dst_scale));
- /* sum (knowing it won't overflow 16 bits) and take high bits */
- dst_wide = vaddq_u16(dst_wide, src_wide);
- dst_final = vshrn_n_u16(dst_wide, 8);
+ /* combine add with dst multiply into mul-accumulate */
+ dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
+ dst_final = vshrn_n_u16(dst_wide, 8);
vst1_u32(dst, vreinterpret_u32_u8(dst_final));
src += UNROLL;
@@ -630,73 +622,234 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
-/* RBE: working on this 2009/10/8 */
-static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
+
+#undef DEBUG_OPAQUE_DITHER
+
+#if defined(DEBUG_OPAQUE_DITHER)
+static void showme8(char *str, void *p, int len)
+{
+ static char buf[256];
+ char tbuf[32];
+ int i;
+ char *pc = (char*) p;
+ sprintf(buf,"%8s:", str);
+ for(i=0;i<len;i++) {
+ sprintf(tbuf, " %02x", pc[i]);
+ strcat(buf, tbuf);
+ }
+ SkDebugf("%s\n", buf);
+}
+static void showme16(char *str, void *p, int len)
+{
+ static char buf[256];
+ char tbuf[32];
+ int i;
+ uint16_t *pc = (uint16_t*) p;
+ sprintf(buf,"%8s:", str);
+ len = (len / sizeof(uint16_t)); /* passed as bytes */
+ for(i=0;i<len;i++) {
+ sprintf(tbuf, " %04x", pc[i]);
+ strcat(buf, tbuf);
+ }
+ SkDebugf("%s\n", buf);
+}
+#endif
+
+static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha, int x, int y) {
SkASSERT(255 == alpha);
-
+
+#define UNROLL 8
+
+ if (count >= UNROLL) {
+ uint8x8_t dbase;
+
+#if defined(DEBUG_OPAQUE_DITHER)
+ uint16_t tmpbuf[UNROLL];
+ int td[UNROLL];
+ int tdv[UNROLL];
+ int ta[UNROLL];
+ int tap[UNROLL];
+ uint16_t in_dst[UNROLL];
+ int offset = 0;
+ int noisy = 0;
+#endif
+
+ const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
+ dbase = vld1_u8(dstart);
+
+ do {
+ uint8x8_t sr, sg, sb, sa, d;
+ uint16x8_t dst8, scale8, alpha;
+ uint16x8_t dst_r, dst_g, dst_b;
+
+#if defined(DEBUG_OPAQUE_DITHER)
+ /* calculate 8 elements worth into a temp buffer */
+ {
+ int my_y = y;
+ int my_x = x;
+ SkPMColor* my_src = (SkPMColor*)src;
+ uint16_t* my_dst = dst;
+ int i;
+
+ DITHER_565_SCAN(my_y);
+ for(i=0;i<UNROLL;i++) {
+ SkPMColor c = *my_src++;
+ SkPMColorAssert(c);
+ if (c) {
+ unsigned a = SkGetPackedA32(c);
+
+ int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
+ tdv[i] = DITHER_VALUE(my_x);
+ ta[i] = a;
+ tap[i] = SkAlpha255To256(a);
+ td[i] = d;
+
+ unsigned sr = SkGetPackedR32(c);
+ unsigned sg = SkGetPackedG32(c);
+ unsigned sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32_FOR_565(sr, d);
+ sg = SkDITHER_G32_FOR_565(sg, d);
+ sb = SkDITHER_B32_FOR_565(sb, d);
+
+ uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+ uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
+ dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+ // now src and dst expanded are in g:11 r:10 x:1 b:10
+ tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+ td[i] = d;
+
+ } else {
+ tmpbuf[i] = *my_dst;
+ ta[i] = tdv[i] = td[i] = 0xbeef;
+ }
+ in_dst[i] = *my_dst;
+ my_dst += 1;
+ DITHER_INC_X(my_x);
+ }
+ }
+#endif
+
+ /* source is in ABGR */
+ {
+ register uint8x8_t d0 asm("d0");
+ register uint8x8_t d1 asm("d1");
+ register uint8x8_t d2 asm("d2");
+ register uint8x8_t d3 asm("d3");
+
+ asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
+ : "r" (src)
+ );
+ sr = d0; sg = d1; sb = d2; sa = d3;
+ }
+
+ /* calculate 'd', which will be 0..7 */
+ /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
+ alpha = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
+ alpha = vmulq_u16(alpha, vmovl_u8(dbase));
+ d = vshrn_n_u16(alpha, 8); /* narrowing too */
+
+ /* sr = sr - (sr>>5) + d */
+ /* watching for 8-bit overflow. d is 0..7; risky range of
+ * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
+ * safe as long as we do ((sr-sr>>5) + d) */
+ sr = vsub_u8(sr, vshr_n_u8(sr, 5));
+ sr = vadd_u8(sr, d);
+
+ /* sb = sb - (sb>>5) + d */
+ sb = vsub_u8(sb, vshr_n_u8(sb, 5));
+ sb = vadd_u8(sb, d);
+
+ /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+ sg = vsub_u8(sg, vshr_n_u8(sg, 6));
+ sg = vadd_u8(sg, vshr_n_u8(d,1));
+
+ /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
+ dst8 = vld1q_u16(dst);
+ dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
+ dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
+ dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
+
+ /* blend */
+ scale8 = vsubw_u8(vdupq_n_u16(255), sa);
+ scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
+ scale8 = vshrq_n_u16(scale8, 3);
+ dst_b = vmulq_u16(dst_b, scale8);
+ dst_g = vmulq_u16(dst_g, scale8);
+ dst_r = vmulq_u16(dst_r, scale8);
+
+ /* combine */
+ /* NB: vshll widens, need to preserve those bits */
+ dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
+ dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
+ dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
+
+ /* repack to store */
+ dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
+ dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
+ dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
+
+ vst1q_u16(dst, dst8);
+
+#if defined(DEBUG_OPAQUE_DITHER)
+ /* verify my 8 elements match the temp buffer */
+ {
+ int i, bad=0;
+ static int invocation;
+
+ for (i=0;i<UNROLL;i++)
+ if (tmpbuf[i] != dst[i]) bad=1;
+ if (invocation < 10 && offset < 32 && bad) {
+ SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
+ invocation, offset);
+ for (i=0;i<UNROLL;i++)
+ SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
+ i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
+ dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
+
+ /* cop out */
+ return;
+ }
+ offset += UNROLL;
+ invocation++;
+ }
+#endif
+
+ dst += UNROLL;
+ src += UNROLL;
+ count -= UNROLL;
+ /* skip x += UNROLL, since it's unchanged mod-4 */
+ } while (count >= UNROLL);
+ }
+#undef UNROLL
+
+ /* residuals */
if (count > 0) {
DITHER_565_SCAN(y);
do {
SkPMColor c = *src++;
SkPMColorAssert(c);
- /* RBE: make sure we don't generate wrong output if c==0 */
if (c) {
-
- /* let's do a vld4 to get 64 bits (8 bytes) of each Argb */
- /* so we'll have 8 a's, 8 r's, etc */
- /* little endian: ABGR is the ordering (R at lsb) */
unsigned a = SkGetPackedA32(c);
- // RBE: could load a table and do vtbl for these things
- // DITHER_VALUE() masks x to 3 bits [0..7] before lookup, so can
- // so 8x unrolling gets us perfectly aligned.
- // and we could even avoid the vtbl at that point
- /* d is 0..7 according to skia/core/SkDither.h asserts */
int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
unsigned sr = SkGetPackedR32(c);
unsigned sg = SkGetPackedG32(c);
unsigned sb = SkGetPackedB32(c);
-
- /* R and B handled identically; G is a little different */
-
- /* sr - (sr>>5) means that +d can NOT overflow */
- /* do (sr-(sr>>5)), followed by adding d -- stay in 8 bits */
- /* sr = sr+d - (sr>>5) */
sr = SkDITHER_R32_FOR_565(sr, d);
- /* calculate sr+(sr>>5) here, then add d */
-
- /* sg = sg + (d>>1) - (sg>>6) */
sg = SkDITHER_G32_FOR_565(sg, d);
- /* sg>>6 could be '3' and d>>1 is <= 3, so we're ok */
- /* calculate sg-(sg>>6), then add "d>>1" */
-
-
- /* sb = sb+d - (sb>>5) */
sb = SkDITHER_B32_FOR_565(sb, d);
- /* calculate sb+(sb>>5) here, then add d */
-
- /* been dealing in 8x8 through here; gonna have to go to 8x16 */
-
- /* need to pick up 8 dst's -- at 16 bits each, 256 bits */
- /* extract dst into 8x16's */
- /* blend */
- /* shift */
- /* reassemble */
-
uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
uint32_t dst_expanded = SkExpand_rgb_16(*dst);
-
- // would be shifted by 8, but the >>3 makes it be just 5
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
// now src and dst expanded are in g:11 r:10 x:1 b:10
*dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
}
dst += 1;
- /* RBE: a NOP with wide enough unrolling; wide_enough == 8 */
DITHER_INC_X(x);
} while (--count != 0);
}
@@ -714,13 +867,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = {
S32_D565_Opaque_PROC,
S32_D565_Blend_PROC,
S32A_D565_Opaque_PROC,
-#if 0
- // when the src-pixel is 0 (transparent), we are still affecting the dst
- // so we're skipping this optimization for now
S32A_D565_Blend_PROC,
-#else
- NULL,
-#endif
// dither
NULL, // S32_D565_Opaque_Dither,