aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-19 19:56:37 +0000
committerGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-10-19 19:56:37 +0000
commit2b524f9b8e0aaeab95e31fc5459ad902e39b2399 (patch)
tree0e2b2823114f3d96764317e691cf8ee5ff603af6
parentfdb5353a693f518f08efe19251176026a072c7a8 (diff)
enable neon opts from motorola for perspective
git-svn-id: http://skia.googlecode.com/svn/trunk@395 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r--src/core/SkBitmapProcState_matrix_clamp.h192
-rw-r--r--src/core/SkBitmapProcState_matrix_repeat.h14
2 files changed, 145 insertions, 61 deletions
diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h
index a0d26cdafe..565af10538 100644
--- a/src/core/SkBitmapProcState_matrix_clamp.h
+++ b/src/core/SkBitmapProcState_matrix_clamp.h
@@ -36,12 +36,12 @@
/* SkClampMax(val,max) -- bound to 0..max */
-#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
-#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
-#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
-#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
+#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale_neon)
+#define SCALE_FILTER_NAME MAKENAME(_filter_scale_neon)
+#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine_neon)
+#define AFFINE_FILTER_NAME MAKENAME(_filter_affine_neon)
+#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp_neon)
+#define PERSP_FILTER_NAME MAKENAME(_filter_persp_neon)
#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
@@ -270,6 +270,8 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
}
}
+#undef DEBUG_PERSP_NOFILTER
+
static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
uint32_t* SK_RESTRICT xy,
int count, int x, int y) {
@@ -286,8 +288,19 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
while ((count = iter.next()) != 0) {
const SkFixed* SK_RESTRICT srcXY = iter.getXY();
-#if 0
- // crashes in ApiDemos - Views - Animation - 3D Transition
+
+#if defined(DEBUG_PERSP_NOFILTER)
+ /* debugging stuff */
+ const SkFixed *end_srcXY = srcXY + (count*2);
+ uint32_t *end_xy = xy + (count);
+ const SkFixed *base_srcXY = srcXY;
+ uint32_t *base_xy = xy;
+ int base_count = count;
+#endif
+
+#if 1
+ // 2009/9/30) crashes in ApiDemos - Views - Animation - 3D Transition
+ // 2009/10/9: reworked, seems right now
/* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
* but we immediately discard the low 16 bits...
@@ -301,68 +314,94 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
int16x4_t maxX4 = vdup_n_s16((int16_t)maxX);
int16x4_t maxY4 = vdup_n_s16((int16_t)maxY);
int16x4_t zero4 = vdup_n_s16(0);
- do {
- register int16x4_t xlo asm("d0");
- register int16x4_t xhi asm("d1");
- register int16x4_t ylo asm("d2");
- register int16x4_t yhi asm("d3");
- register int16x4_t x2lo asm("d4");
- register int16x4_t x2hi asm("d5");
- register int16x4_t y2lo asm("d6");
- register int16x4_t y2hi asm("d7");
- register int16x4_t out_xhi asm("d8");
- register int16x4_t out_yhi asm("d9");
- register int16x4_t out_x2hi asm("d10");
- register int16x4_t out_y2hi asm("d11");
+ /* The constructs with local blocks for register assignments
+ * and asm() instructions is to make keep any hard register
+ * assignments to as small a scope as possible. and to avoid
+ * burning call-preserved hard registers on the vld/vst
+ * instructions.
+ */
+ do {
+ int16x4_t xlo, xhi, ylo, yhi;
+ int16x4_t x2lo, x2hi, y2lo, y2hi;
/* vld4 does the de-interleaving for us */
- /* dependent on register assignments above */
- asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
- : "=w" (xlo), "=w" (xhi), "=w" (ylo), "=w" (yhi)
- : "r" (mysrc)
- );
+ {
+ register int16x4_t t_xlo asm("d0");
+ register int16x4_t t_xhi asm("d1");
+ register int16x4_t t_ylo asm("d2");
+ register int16x4_t t_yhi asm("d3");
- /* offset == 256 bits == 32 bytes == 8 longs */
- asm ("vld4.16 {d4-d7},[%4,#32] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
- : "=w" (x2lo), "=w" (x2hi), "=w" (y2lo), "=w" (y2hi)
- : "r" (mysrc)
- );
-
- /* clamp the first 4 here */
+ asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
+ : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
+ : "r" (mysrc)
+ );
+ xlo = t_xlo;
+ xhi = t_xhi;
+ ylo = t_ylo;
+ yhi = t_yhi;
+ }
/* clamp X>>16 (aka xhi) to 0..maxX */
xhi = vmax_s16(xhi, zero4); /* now 0.. */
- out_xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */
+ xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */
/* clamp Y>>16 (aka yhi) to 0..maxY */
yhi = vmax_s16(yhi, zero4); /* now 0.. */
- out_yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */
+ yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */
+
+ /* deal with the second set of numbers */
+ {
+ register int16x4_t t_xlo asm("d4");
+ register int16x4_t t_xhi asm("d5");
+ register int16x4_t t_ylo asm("d6");
+ register int16x4_t t_yhi asm("d7");
+
+ /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */
+ asm ("vld4.16 {d4-d7},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */"
+ : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi)
+ : "r" (mysrc+16)
+ );
+ x2lo = t_xlo;
+ x2hi = t_xhi;
+ y2lo = t_ylo;
+ y2hi = t_yhi;
+ }
/* clamp the second 4 here */
+ if (0) { extern void rbe(void); rbe(); }
+
/* clamp X>>16 (aka xhi) to 0..maxX */
x2hi = vmax_s16(x2hi, zero4); /* now 0.. */
- out_x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */
+ x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */
/* clamp Y>>16 (aka yhi) to 0..maxY */
y2hi = vmax_s16(y2hi, zero4); /* now 0.. */
- out_y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */
+ y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */
/* we're storing as {x,y}s: x is [0], y is [1] */
/* we'll use vst2 to make this happen */
- /* XXX: could use auto increment! */
- asm ("vst2.16 {d8-d9},[%2] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_xhi), "w" (out_yhi), "r" (mydst)
- );
- /* offset == 16 bytes == 8 shorts */
- asm ("vst2.16 {d10-d11},[%2,#16] /* xlo=%P0 xhi=%P1 */"
- :
- : "w" (out_x2hi), "w" (out_y2hi), "r" (mydst)
- );
+ {
+ register int16x4_t out_x asm("d16") = xhi;
+ register int16x4_t out_y asm("d17") = yhi;
+
+ asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */"
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst)
+ );
+ }
+ {
+ register int16x4_t out_x asm("d18") = x2hi;
+ register int16x4_t out_y asm("d19") = y2hi;
+
+ asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */"
+ :
+ : "w" (out_x), "w" (out_y), "r" (mydst+8)
+ );
+ }
/* XXX: gcc isn't interleaving these with the NEON ops
* but i think that all the scoreboarding works out */
@@ -375,14 +414,58 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
xy = (uint32_t *) mydst;
}
#endif
+
while (--count >= 0) {
*xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
TILEX_PROCF(srcXY[0], maxX);
srcXY += 2;
}
+
+#if defined(DEBUG_PERSP_NOFILTER)
+ /* for checking our NEON-produced results against vanilla code */
+ {
+ int bad = (-1);
+ for (int i = 0; i < base_count; i++)
+ {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+
+ if (val != base_xy[i]) {
+ bad = i;
+ break;
+ }
+ }
+ if (bad >= 0) {
+ SkDebugf ("clamp-nofilter-persp failed piece %d\n", bad);
+ SkDebugf (" maxX %08x maxY %08x\n", maxX, maxY);
+ bad -= (bad & 0x7); /* align */
+ for (int i = bad; i < bad + 8; i++) {
+ uint32_t val;
+ val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) |
+ TILEX_PROCF (base_srcXY[i * 2 + 0], maxX);
+
+ SkDebugf ("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n",
+ i, base_xy[i], val, base_srcXY[i * 2 + 0],
+ base_srcXY[i * 2 + 1]);
+ }
+ SkDebugf ("---\n");
+ }
+
+ if (end_xy != xy) {
+ SkDebugf ("xy ended at %08x, should be %08x\n", xy, end_xy);
+ }
+ if (end_srcXY != srcXY) {
+ SkDebugf ("srcXY ended at %08x, should be %08x\n", srcXY,
+ end_srcXY);
+ }
+ }
+#endif
}
}
+#undef DEBUG_PERSP_NOFILTER
+
//////////////////////////////////////////////////////////////////////////////
static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
@@ -539,8 +622,8 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1;
/* need side-by-side registers for vst2.32 tricks */
- register int32x4_t wide_x asm("q1");
- register int32x4_t wide_y asm("q0");
+ register int32x4_t wide_x asm("q7");
+ register int32x4_t wide_y asm("q6");
#undef AFFINE_DEBUG
#if defined(AFFINE_DEBUG)
@@ -615,7 +698,7 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
wide_y = vorrq_s32(wide_i, wide_fy1);
/* interleave as YXYXYXYX as part of the storing */
- asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
+ asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
:
: "w" (wide_y), "w" (wide_x), "r" (xy)
);
@@ -691,8 +774,9 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
int32x4_t wide_x, wide_y;
/* need side-by-side regs for vld2/vst2 tricks */
- register int32x4_t wide_first asm ("q0");
- register int32x4_t wide_second asm ("q1");
+ /* RBE: avoid low registers */
+ register int32x4_t wide_first asm ("q6");
+ register int32x4_t wide_second asm ("q7");
while (count >= 4) {
/* RBE: it's good, but:
@@ -701,7 +785,7 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
*/
/* load src: x-y-x-y-x-y-x-y */
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
+ asm ("vld2.32 {q6-q7},[%2] /* x=%q0 y=%q1 */"
: "=w" (wide_first), "=w" (wide_second)
: "r" (srcXY));
@@ -773,7 +857,7 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
count -= 4;
/* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */
- asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */"
+ asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */"
:
: "w" (wide_first), "w" (wide_second), "r" (xy));
diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h
index 2697db6a9f..8d743a5ffc 100644
--- a/src/core/SkBitmapProcState_matrix_repeat.h
+++ b/src/core/SkBitmapProcState_matrix_repeat.h
@@ -34,11 +34,11 @@
/* SkClampMax(val,max) -- bound to 0..max */
-#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
+#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale_neon)
#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
-#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
+#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine_neon)
#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
-#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
+#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp_neon)
#define PERSP_FILTER_NAME MAKENAME(_filter_persp)
#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
@@ -373,15 +373,15 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
/* read array of x,y,x,y,x,y */
/* vld2 does the de-interleaving for us */
/* dependent on register assignments above */
- asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
+ asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
: "=w" (x), "=w" (y)
: "r" (mysrc)
);
/* offset == 256 bits == 32 bytes == 8 longs */
- asm ("vld2.32 {q2-q3},[%2,#32] /* x=%q0 y=%q1 */"
+ asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
: "=w" (x2), "=w" (y2)
- : "r" (mysrc)
+ : "r" (mysrc+8)
);
/* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
@@ -405,7 +405,7 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
hi = vreinterpretq_s16_s32(y);
vst1q_s16(mydst, hi);
- /* and get second 8 bytes out */
+ /* and push second 8 entries out */
y2 = vsriq_n_s32(y2, x2, 16);
hi2 = vreinterpretq_s16_s32(y2);
vst1q_s16(mydst+8, hi2);