diff options
author | reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2009-10-22 20:26:53 +0000 |
---|---|---|
committer | reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2009-10-22 20:26:53 +0000 |
commit | 522aa8d4d61f30bddbaf996b2933d5f7b1f23880 (patch) | |
tree | f668bce2e778de7301d6e2225fb9807c16f0f642 /src/core | |
parent | 8481ccc199ba3ed03acb19b1f341394db5bcb834 (diff) |
fix S32A_D565_Blend_neon
git-svn-id: http://skia.googlecode.com/svn/trunk@405 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/SkBitmapProcState_matrix_clamp.h | 110 | ||||
-rw-r--r-- | src/core/SkBitmapProcState_matrix_repeat.h | 66 |
2 files changed, 93 insertions, 83 deletions
diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h index 565af10538..22434680f4 100644 --- a/src/core/SkBitmapProcState_matrix_clamp.h +++ b/src/core/SkBitmapProcState_matrix_clamp.h @@ -299,8 +299,8 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, #endif #if 1 - // 2009/9/30) crashes in ApiDemos - Views - Animation - 3D Transition - // 2009/10/9: reworked, seems right now + // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition + // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... * but we immediately discard the low 16 bits... @@ -389,18 +389,18 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, register int16x4_t out_y asm("d17") = yhi; asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst) - ); + : + : "w" (out_x), "w" (out_y), "r" (mydst) + ); } { register int16x4_t out_x asm("d18") = x2hi; register int16x4_t out_y asm("d19") = y2hi; asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst+8) - ); + : + : "w" (out_x), "w" (out_y), "r" (mydst+8) + ); } /* XXX: gcc isn't interleaving these with the NEON ops @@ -425,39 +425,38 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, /* for checking our NEON-produced results against vanilla code */ { int bad = (-1); - for (int i = 0; i < base_count; i++) - { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); + for (int i = 0; i < base_count; i++) { + uint32_t val; + val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | + TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - if (val != base_xy[i]) { - bad = i; - break; - } + if (val != base_xy[i]) { + bad = i; + break; + } } if (bad >= 0) { - SkDebugf ("clamp-nofilter-persp failed piece %d\n", bad); - SkDebugf (" maxX %08x maxY %08x\n", maxX, maxY); - bad -= 
(bad & 0x7); /* align */ - for (int i = bad; i < bad + 8; i++) { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - - SkDebugf ("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", - i, base_xy[i], val, base_srcXY[i * 2 + 0], - base_srcXY[i * 2 + 1]); - } - SkDebugf ("---\n"); + SkDebugf("clamp-nofilter-persp failed piece %d\n", bad); + SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY); + bad -= (bad & 0x7); /* align */ + for (int i = bad; i < bad + 8; i++) { + uint32_t val; + val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | + TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); + + SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", + i, base_xy[i], val, base_srcXY[i * 2 + 0], + base_srcXY[i * 2 + 1]); + } + SkDebugf ("---\n"); } if (end_xy != xy) { - SkDebugf ("xy ended at %08x, should be %08x\n", xy, end_xy); + SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy); } if (end_srcXY != srcXY) { - SkDebugf ("srcXY ended at %08x, should be %08x\n", srcXY, - end_srcXY); + SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY, + end_srcXY); } } #endif @@ -621,10 +620,6 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, int32x4_t wide_dx, wide_fx, wide_onex, wide_fx1; int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1; - /* need side-by-side registers for vst2.32 tricks */ - register int32x4_t wide_x asm("q7"); - register int32x4_t wide_y asm("q6"); - #undef AFFINE_DEBUG #if defined(AFFINE_DEBUG) SkFixed fyp = fy; @@ -649,6 +644,9 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, wide_oney = vdupq_n_s32(oneY); while (count >= 4) { + int32x4_t wide_x; + int32x4_t wide_y; + /* do the X side, then the Y side, then interleave them */ /* original expands to: @@ -698,10 +696,17 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, wide_y = vorrq_s32(wide_i, wide_fy1); /* interleave as YXYXYXYX as part of the storing */ - asm ("vst2.32 {q6-q7},[%2] /* y=%q0 
x=%q1 */" - : - : "w" (wide_y), "w" (wide_x), "r" (xy) - ); + { + /* vst2.32 needs side-by-side registers */ + register int32x4_t t_x asm("q1"); + register int32x4_t t_y asm("q0"); + + t_x = wide_x; t_y = wide_y; + asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" + : + : "w" (t_y), "w" (t_x), "r" (xy) + ); + } #if defined(AFFINE_DEBUG) /* make sure we're good here -- check the 4 we just output */ @@ -773,22 +778,21 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s, int32x4_t wide_fy1; int32x4_t wide_x, wide_y; - /* need side-by-side regs for vld2/vst2 tricks */ - /* RBE: avoid low registers */ - register int32x4_t wide_first asm ("q6"); - register int32x4_t wide_second asm ("q7"); - while (count >= 4) { + /* need side-by-side regs for vld2/vst2 tricks */ + register int32x4_t wide_first asm ("q0"); + register int32x4_t wide_second asm ("q1"); + /* RBE: it's good, but: * -- we spill a constant that could be easily regnerated * [perhaps tweak gcc's NEON constant costs?] */ /* load src: x-y-x-y-x-y-x-y */ - asm ("vld2.32 {q6-q7},[%2] /* x=%q0 y=%q1 */" + asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" : "=w" (wide_first), "=w" (wide_second) : "r" (srcXY)); - + /* immediately get into vars gcc can move around if needed */ wide_x = wide_first; wide_y = wide_second; @@ -848,16 +852,16 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s, /* switch them around; have to do it this way to get them * in the proper registers to match our instruction */ - /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */ - wide_first = wide_y; - wide_second = wide_x; - /* iteration bookkeeping, ahead of the asm() for scheduling */ srcXY += 2*4; count -= 4; /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */ - asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */" + /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */ + wide_first = wide_y; + wide_second = wide_x; + + asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" : : "w" (wide_first), "w" (wide_second), "r" (xy)); 
diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h index 8d743a5ffc..e5309f8dea 100644 --- a/src/core/SkBitmapProcState_matrix_repeat.h +++ b/src/core/SkBitmapProcState_matrix_repeat.h @@ -95,8 +95,7 @@ static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, * but some processing of the 'fx' information * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - if (count >= 8) - { + if (count >= 8) { /* SkFixed is 16.16 fixed point */ SkFixed dx2 = dx+dx; SkFixed dx4 = dx2+dx2; @@ -224,8 +223,7 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, * but some processing of the 'fx' information * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - if (count >= 4) - { + if (count >= 4) { /* SkFixed is 16.16 fixed point */ SkFixed dx4 = dx*4; SkFixed dy4 = dy*4; @@ -257,11 +255,10 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); /* store & bump */ - do - { + do { int32x4_t xout; - int32x4_t yout; - int16x8_t hi16; + int32x4_t yout; + int16x8_t hi16; /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ /* mask to low 16 [would like to use uzp tricks) */ @@ -303,11 +300,11 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, ofx += odx; ofy += ody; } if (bad) { - SkDebugf("repeat-nofilter-affine fails\n"); - SkDebugf("count %d myi %d\n", ocount, myi); - SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", - bfx, bdx, bfy, bdy); - SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); + SkDebugf("repeat-nofilter-affine fails\n"); + SkDebugf("count %d myi %d\n", ocount, myi); + SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", + bfx, bdx, bfy, bdy); + SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); } #endif #endif @@ -362,27 +359,36 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, int32_t *mysrc = (int32_t *) srcXY; int16_t *mydst = (int16_t *) xy; do { - register int32x4_t x asm("q0"); - register int32x4_t 
y asm("q1"); - register int32x4_t x2 asm("q2"); - register int32x4_t y2 asm("q3"); - - int16x8_t hi; - int16x8_t hi2; + int32x4_t x, y, x2, y2; + int16x8_t hi, hi2; /* read array of x,y,x,y,x,y */ /* vld2 does the de-interleaving for us */ - /* dependent on register assignments above */ - asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" - : "=w" (x), "=w" (y) - : "r" (mysrc) - ); + /* isolate reg-bound scopes; gcc will minimize register + * motion if possible; this ensures that we don't lose + * a register across a debugging call because it happens + * to be bound into a call-clobbered register + */ + { + register int32x4_t q0 asm("q0"); + register int32x4_t q1 asm("q1"); + asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" + : "=w" (q0), "=w" (q1) + : "r" (mysrc) + ); + x = q0; y = q1; + } /* offset == 256 bits == 32 bytes == 8 longs */ - asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */" - : "=w" (x2), "=w" (y2) - : "r" (mysrc+8) - ); + { + register int32x4_t q2 asm("q2"); + register int32x4_t q3 asm("q3"); + asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */" + : "=w" (q2), "=w" (q3) + : "r" (mysrc+8) + ); + x2 = q2; y2 = q3; + } /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ /* mask to low 16 [would like to use uzp tricks) */ @@ -405,7 +411,7 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, hi = vreinterpretq_s16_s32(y); vst1q_s16(mydst, hi); - /* and push second 8 entries out */ + /* and likewise for the second 8 entries */ y2 = vsriq_n_s32(y2, x2, 16); hi2 = vreinterpretq_s16_s32(y2); vst1q_s16(mydst+8, hi2); |