aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core
diff options
context:
space:
mode:
authorGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-22 20:26:53 +0000
committerGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-22 20:26:53 +0000
commit522aa8d4d61f30bddbaf996b2933d5f7b1f23880 (patch)
treef668bce2e778de7301d6e2225fb9807c16f0f642 /src/core
parent8481ccc199ba3ed03acb19b1f341394db5bcb834 (diff)
fix S32A_D565_Blend_neon
git-svn-id: http://skia.googlecode.com/svn/trunk@405 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/core')
-rw-r--r--src/core/SkBitmapProcState_matrix_clamp.h110
-rw-r--r--src/core/SkBitmapProcState_matrix_repeat.h66
2 files changed, 93 insertions, 83 deletions
diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h
index 565af10538..22434680f4 100644
--- a/src/core/SkBitmapProcState_matrix_clamp.h
+++ b/src/core/SkBitmapProcState_matrix_clamp.h
@@ -299,8 +299,8 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
#endif
#if 1
- // 2009/9/30) crashes in ApiDemos - Views - Animation - 3D Transition
- // 2009/10/9: reworked, seems right now
+ // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition
+ // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn
/* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
* but we immediately discard the low 16 bits...
@@ -389,18 +389,18 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
register int16x4_t out_y asm("d17") = yhi;
asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst)
- );
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst)
+ );
}
{
register int16x4_t out_x asm("d18") = x2hi;
register int16x4_t out_y asm("d19") = y2hi;
asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x), "w" (out_y), "r" (mydst+8)
- );
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst+8)
+ );
}
/* XXX: gcc isn't interleaving these with the NEON ops
@@ -425,39 +425,38 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
/* for checking our NEON-produced results against vanilla code */
{
int bad = (-1);
- for (int i = 0; i < base_count; i++)
- {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+ for (int i = 0; i < base_count; i++) {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
- if (val != base_xy[i]) {
- bad = i;
- break;
- }
+ if (val != base_xy[i]) {
+ bad = i;
+ break;
+ }
}
if (bad >= 0) {
- SkDebugf ("clamp-nofilter-persp failed piece %d\n", bad);
- SkDebugf (" maxX %08x maxY %08x\n", maxX, maxY);
- bad -= (bad & 0x7); /* align */
- for (int i = bad; i < bad + 8; i++) {
- uint32_t val;
- val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
- TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
-
- SkDebugf ("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
- i, base_xy[i], val, base_srcXY[i * 2 + 0],
- base_srcXY[i * 2 + 1]);
- }
- SkDebugf ("---\n");
+ SkDebugf("clamp-nofilter-persp failed piece %d\n", bad);
+ SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY);
+ bad -= (bad & 0x7); /* align */
+ for (int i = bad; i < bad + 8; i++) {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+
+ SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
+ i, base_xy[i], val, base_srcXY[i * 2 + 0],
+ base_srcXY[i * 2 + 1]);
+ }
+ SkDebugf ("---\n");
}
if (end_xy != xy) {
- SkDebugf ("xy ended at %08x, should be %08x\n", xy, end_xy);
+ SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy);
}
if (end_srcXY != srcXY) {
- SkDebugf ("srcXY ended at %08x, should be %08x\n", srcXY,
- end_srcXY);
+ SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY,
+ end_srcXY);
}
}
#endif
@@ -621,10 +620,6 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_dx, wide_fx, wide_onex, wide_fx1;
int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1;
- /* need side-by-side registers for vst2.32 tricks */
- register int32x4_t wide_x asm("q7");
- register int32x4_t wide_y asm("q6");
-
#undef AFFINE_DEBUG
#if defined(AFFINE_DEBUG)
SkFixed fyp = fy;
@@ -649,6 +644,9 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
wide_oney = vdupq_n_s32(oneY);
while (count >= 4) {
+ int32x4_t wide_x;
+ int32x4_t wide_y;
+
/* do the X side, then the Y side, then interleave them */
/* original expands to:
@@ -698,10 +696,17 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
wide_y = vorrq_s32(wide_i, wide_fy1);
/* interleave as YXYXYXYX as part of the storing */
- asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
- :
- : "w" (wide_y), "w" (wide_x), "r" (xy)
- );
+ {
+ /* vst2.32 needs side-by-side registers */
+ register int32x4_t t_x asm("q1");
+ register int32x4_t t_y asm("q0");
+
+ t_x = wide_x; t_y = wide_y;
+ asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
+ :
+ : "w" (t_y), "w" (t_x), "r" (xy)
+ );
+ }
#if defined(AFFINE_DEBUG)
/* make sure we're good here -- check the 4 we just output */
@@ -773,22 +778,21 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_fy1;
int32x4_t wide_x, wide_y;
- /* need side-by-side regs for vld2/vst2 tricks */
- /* RBE: avoid low registers */
- register int32x4_t wide_first asm ("q6");
- register int32x4_t wide_second asm ("q7");
-
while (count >= 4) {
+ /* need side-by-side regs for vld2/vst2 tricks */
+ register int32x4_t wide_first asm ("q0");
+ register int32x4_t wide_second asm ("q1");
+
/* RBE: it's good, but:
* -- we spill a constant that could be easily regnerated
* [perhaps tweak gcc's NEON constant costs?]
*/
/* load src: x-y-x-y-x-y-x-y */
- asm ("vld2.32 {q6-q7},[%2] /* x=%q0 y=%q1 */"
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
: "=w" (wide_first), "=w" (wide_second)
: "r" (srcXY));
-
+ /* immediately get into vars gcc can move around if needed */
wide_x = wide_first;
wide_y = wide_second;
@@ -848,16 +852,16 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
/* switch them around; have to do it this way to get them
* in the proper registers to match our instruction */
- /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */
- wide_first = wide_y;
- wide_second = wide_x;
-
/* iteration bookkeeping, ahead of the asm() for scheduling */
srcXY += 2*4;
count -= 4;
/* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */
- asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
+ /* wide_first/wide_second are bound to q0/q1; fill them as y,x to match the store order */
+ wide_first = wide_y;
+ wide_second = wide_x;
+
+ asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
:
: "w" (wide_first), "w" (wide_second), "r" (xy));
diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h
index 8d743a5ffc..e5309f8dea 100644
--- a/src/core/SkBitmapProcState_matrix_repeat.h
+++ b/src/core/SkBitmapProcState_matrix_repeat.h
@@ -95,8 +95,7 @@ static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
* but some processing of the 'fx' information
* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
*/
- if (count >= 8)
- {
+ if (count >= 8) {
/* SkFixed is 16.16 fixed point */
SkFixed dx2 = dx+dx;
SkFixed dx4 = dx2+dx2;
@@ -224,8 +223,7 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
* but some processing of the 'fx' information
* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
*/
- if (count >= 4)
- {
+ if (count >= 4) {
/* SkFixed is 16.16 fixed point */
SkFixed dx4 = dx*4;
SkFixed dy4 = dy*4;
@@ -257,11 +255,10 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
/* store & bump */
- do
- {
+ do {
int32x4_t xout;
- int32x4_t yout;
- int16x8_t hi16;
+ int32x4_t yout;
+ int16x8_t hi16;
/* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
/* mask to low 16 [would like to use uzp tricks) */
@@ -303,11 +300,11 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
ofx += odx; ofy += ody;
}
if (bad) {
- SkDebugf("repeat-nofilter-affine fails\n");
- SkDebugf("count %d myi %d\n", ocount, myi);
- SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
- bfx, bdx, bfy, bdy);
- SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
+ SkDebugf("repeat-nofilter-affine fails\n");
+ SkDebugf("count %d myi %d\n", ocount, myi);
+ SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
+ bfx, bdx, bfy, bdy);
+ SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
}
#endif
#endif
@@ -362,27 +359,36 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
int32_t *mysrc = (int32_t *) srcXY;
int16_t *mydst = (int16_t *) xy;
do {
- register int32x4_t x asm("q0");
- register int32x4_t y asm("q1");
- register int32x4_t x2 asm("q2");
- register int32x4_t y2 asm("q3");
-
- int16x8_t hi;
- int16x8_t hi2;
+ int32x4_t x, y, x2, y2;
+ int16x8_t hi, hi2;
/* read array of x,y,x,y,x,y */
/* vld2 does the de-interleaving for us */
- /* dependent on register assignments above */
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
- : "=w" (x), "=w" (y)
- : "r" (mysrc)
- );
+ /* isolate reg-bound scopes; gcc will minimize register
+ * motion if possible; this ensures that we don't lose
+ * a register across a debugging call because it happens
+ * to be bound into a call-clobbered register
+ */
+ {
+ register int32x4_t q0 asm("q0");
+ register int32x4_t q1 asm("q1");
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
+ : "=w" (q0), "=w" (q1)
+ : "r" (mysrc)
+ );
+ x = q0; y = q1;
+ }
/* offset == 256 bits == 32 bytes == 8 longs */
- asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
- : "=w" (x2), "=w" (y2)
- : "r" (mysrc+8)
- );
+ {
+ register int32x4_t q2 asm("q2");
+ register int32x4_t q3 asm("q3");
+ asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
+ : "=w" (q2), "=w" (q3)
+ : "r" (mysrc+8)
+ );
+ x2 = q2; y2 = q3;
+ }
/* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
/* mask to low 16 [would like to use uzp tricks) */
@@ -405,7 +411,7 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
hi = vreinterpretq_s16_s32(y);
vst1q_s16(mydst, hi);
- /* and push second 8 entries out */
+ /* and likewise for the second 8 entries */
y2 = vsriq_n_s32(y2, x2, 16);
hi2 = vreinterpretq_s16_s32(y2);
vst1q_s16(mydst+8, hi2);