ARM Skia NEON patches - 18 - Preparation work for BitmapProcState

BitmapProcState: clean a little and get rid of some asm replacing the apparently stupid dx+dx+dx leads to more instructions being generated. Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BitmapProcState: move code common to C and NEON to a separate header Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/21931002 git-svn-id: http://skia.googlecode.com/svn/trunk@11109 2bbb7eff-a529-9590-31e7-b0007b416f81
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-09-05 18:27:57 +0000
committer: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-09-05 18:27:57 +0000
commit: a8c09668f9e602f77422a344dfa4d13155c91fd3 (patch)
tree: a0948b0f4cc33f7860154663dd8974d26be43cef /src/opts/SkBitmapProcState_matrixProcs_neon.cpp
parent: fd47a121dc65b1bf8a63943d202d3b905858129b (diff)
1 files changed, 33 insertions, 57 deletions
diff --git a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
index 7d75143e20..e81da67052 100644
--- a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
+++ b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
@@ -8,6 +8,7 @@
 #include "SkPerspIter.h"
 #include "SkShader.h"
 #include "SkUtilsArm.h"
+#include "SkBitmapProcState_utils.h"
 
 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
@@ -15,10 +16,6 @@ extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 
-static unsigned SK_USHIFT16(unsigned x) {
-    return x >> 16;
-}
-
 #define MAKENAME(suffix)        ClampX_ClampY ## suffix ## _neon
 #define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
 #define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
@@ -35,93 +32,72 @@ static unsigned SK_USHIFT16(unsigned x) {
 #include "SkBitmapProcState_matrix_repeat_neon.h"
 
 
-void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
-{
-    int i;
 
+void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
     if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx2 = dx+dx;
-        SkFixed dx4 = dx2+dx2;
-        SkFixed dx8 = dx4+dx4;
+        // SkFixed is 16.16 fixed point
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
 
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        SkFixed fx1, fx2, fx3;
+        // setup lbase and hbase
         int32x4_t lbase, hbase;
-        uint16_t *dst16 = (uint16_t *)dst;
-
-        fx1 = fx+dx;
-        fx2 = fx1+dx;
-        fx3 = fx2+dx;
-
-        /* avoid an 'lbase unitialized' warning */
         lbase = vdupq_n_s32(fx);
-        lbase = vsetq_lane_s32(fx1, lbase, 1);
-        lbase = vsetq_lane_s32(fx2, lbase, 2);
-        lbase = vsetq_lane_s32(fx3, lbase, 3);
-        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
+        lbase = vsetq_lane_s32(fx + dx, lbase, 1);
+        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
+        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
+        hbase = lbase + vdupq_n_s32(4 * dx);
 
-        /* take upper 16 of each, store, and bump everything */
         do {
-            int32x4_t lout, hout;
-            uint16x8_t hi16;
-
-            lout = lbase;
-            hout = hbase;
-            /* gets hi's of all louts then hi's of all houts */
-            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
-            hi16 = vreinterpretq_u16_s32(hout);
-            vst1q_u16(dst16, hi16);
-
-            /* on to the next */
-            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
-            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
-            dst16 += 8;
+            // store the upper 16 bits
+            vst1q_u32(dst, vreinterpretq_u32_s16(
+                vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
+            ));
+
+            // on to the next group of 8
+            lbase += vdx8;
+            hbase += vdx8;
+            dst += 4; // we did 8 elements but the result is twice smaller
             count -= 8;
             fx += dx8;
         } while (count >= 8);
-        dst = (uint32_t *) dst16;
     }
 
     uint16_t* xx = (uint16_t*)dst;
-    for (i = count; i > 0; --i) {
+    for (int i = count; i > 0; --i) {
         *xx++ = SkToU16(fx >> 16); fx += dx;
     }
 }
 
-void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
-{
+void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
     if (count >= 8) {
-        int32x4_t wide_fx;
-        int32x4_t wide_fx2;
-        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
 
+        int32x4_t wide_fx, wide_fx2;
         wide_fx = vdupq_n_s32(fx);
-        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
-        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
-        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
+        wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
+        wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
+        wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
 
-        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
+        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
 
         while (count >= 8) {
             int32x4_t wide_out;
             int32x4_t wide_out2;
 
             wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
-            wide_out = vorrq_s32(wide_out,
-            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
+            wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
 
             wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
-            wide_out2 = vorrq_s32(wide_out2,
-            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
+            wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
 
             vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
             vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
 
             dst += 8;
-            fx += dx*8;
-            wide_fx = vaddq_s32(wide_fx, wide_dx8);
-            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
+            fx += dx8;
+            wide_fx += vdx8;
+            wide_fx2 += vdx8;
             count -= 8;
         }
     }
author	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-09-05 18:27:57 +0000
committer	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-09-05 18:27:57 +0000
commit	a8c09668f9e602f77422a344dfa4d13155c91fd3 (patch)
tree	a0948b0f4cc33f7860154663dd8974d26be43cef /src/opts/SkBitmapProcState_matrixProcs_neon.cpp
parent	fd47a121dc65b1bf8a63943d202d3b905858129b (diff)