aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/SkBitmapProcState_matrixProcs.cpp
diff options
context:
space:
mode:
authorGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-09-15 14:10:42 +0000
committerGravatar reed@android.com <reed@android.com@2bbb7eff-a529-9590-31e7-b0007b416f81>2009-09-15 14:10:42 +0000
commited881c2704bc81fe46a68c0cf9e292287313baa6 (patch)
treeb1cc4beed571a83994f7f8100c0cba76183f850c /src/core/SkBitmapProcState_matrixProcs.cpp
parentebdeeb8a018f2df01e190fd961d68a94f0e0fcb9 (diff)
add neon opts for matrix procs
git-svn-id: http://skia.googlecode.com/svn/trunk@353 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/core/SkBitmapProcState_matrixProcs.cpp')
-rw-r--r--src/core/SkBitmapProcState_matrixProcs.cpp105
1 files changed, 100 insertions, 5 deletions
diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp
index 67b57c4418..6654312a8f 100644
--- a/src/core/SkBitmapProcState_matrixProcs.cpp
+++ b/src/core/SkBitmapProcState_matrixProcs.cpp
@@ -1,3 +1,5 @@
+/* NEON optimized code (C) COPYRIGHT 2009 Motorola */
+
#include "SkBitmapProcState.h"
#include "SkPerspIter.h"
#include "SkShader.h"
@@ -31,14 +33,22 @@ void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
#define CHECK_FOR_DECAL
-#include "SkBitmapProcState_matrix.h"
+#if defined(__ARM_HAVE_NEON)
+ #include "SkBitmapProcState_matrix_clamp.h"
+#else
+ #include "SkBitmapProcState_matrix.h"
+#endif
#define MAKENAME(suffix) RepeatX_RepeatY ## suffix
#define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
#define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#include "SkBitmapProcState_matrix.h"
+#if defined(__ARM_HAVE_NEON)
+ #include "SkBitmapProcState_matrix_repeat.h"
+#else
+ #include "SkBitmapProcState_matrix.h"
+#endif
#define MAKENAME(suffix) GeneralXY ## suffix
#define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
@@ -147,6 +157,52 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
int i;
+#if defined(__ARM_HAVE_NEON)
+ if (count >= 8) {
+ /* SkFixed is 16.16 fixed point */
+ SkFixed dx2 = dx+dx;
+ SkFixed dx4 = dx2+dx2;
+ SkFixed dx8 = dx4+dx4;
+
+ /* now build fx/fx+dx/fx+2dx/fx+3dx */
+ SkFixed fx1, fx2, fx3;
+ int32x2_t lower, upper;
+ int32x4_t lbase, hbase;
+ uint16_t *dst16 = (uint16_t *)dst;
+
+ fx1 = fx+dx;
+ fx2 = fx1+dx;
+ fx3 = fx2+dx;
+
+ /* avoid an 'lbase uninitialized' warning */
+ lbase = vdupq_n_s32(fx);
+ lbase = vsetq_lane_s32(fx1, lbase, 1);
+ lbase = vsetq_lane_s32(fx2, lbase, 2);
+ lbase = vsetq_lane_s32(fx3, lbase, 3);
+ hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
+
+ /* take upper 16 of each, store, and bump everything */
+ do {
+ int32x4_t lout, hout;
+ uint16x8_t hi16;
+
+ lout = lbase;
+ hout = hbase;
+ /* gets hi's of all louts then hi's of all houts */
+ asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
+ hi16 = vreinterpretq_u16_s32(hout);
+ vst1q_u16(dst16, hi16);
+
+ /* on to the next */
+ lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
+ hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
+ dst16 += 8;
+ count -= 8;
+ fx += dx8;
+ } while (count >= 8);
+ dst = (uint32_t *) dst16;
+ }
+#else
for (i = (count >> 2); i > 0; --i)
{
*dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
@@ -154,16 +210,55 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
*dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
fx += dx+dx;
}
- uint16_t* xx = (uint16_t*)dst;
+ count &= 3;
+#endif
- for (i = (count & 3); i > 0; --i)
- {
+ uint16_t* xx = (uint16_t*)dst;
+ for (i = count; i > 0; --i) {
*xx++ = SkToU16(fx >> 16); fx += dx;
}
}
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
+
+#if defined(__ARM_HAVE_NEON)
+ if (count >= 8) {
+ int32x4_t wide_fx;
+ int32x4_t wide_fx2;
+ int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
+
+ wide_fx = vdupq_n_s32(fx);
+ wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
+ wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
+ wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
+
+ wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
+
+ while (count >= 8) {
+ int32x4_t wide_out;
+ int32x4_t wide_out2;
+
+ wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
+ wide_out = vorrq_s32(wide_out,
+ vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
+
+ wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
+ wide_out2 = vorrq_s32(wide_out2,
+ vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
+
+ vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
+ vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
+
+ dst += 8;
+ fx += dx*8;
+ wide_fx = vaddq_s32(wide_fx, wide_dx8);
+ wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
+ count -= 8;
+ }
+ }
+#endif
+
if (count & 1)
{
SkASSERT((fx >> (16 + 14)) == 0);