arm: dynamic NEON support for SkBitmapProcState matrix operations.

This patch implements dynamic ARM NEON support for the functions implemented by src/core/SkBitmapProcState_matrixProcs.cpp. - Because the SkBitmapProcState_matrix_{clamp,repeat}.h headers are NEON-specific, they are renamed with a _neon.h suffix, and moved to src/opts/ (from src/core/) - Add a new file src/opts/SkBitmapProcState_matrixProcs_neon.cpp which implements the NEON code paths for all builds, and add it to the 'opts_neon' static library. - Modify SkBitmapProcState_matrixProcs.cpp to select the right code-path depending on our build configuration. Note that in the case where 'arm_neon == 1', we do not embed regular ARM code paths in the final binary. Only 'arm_neon_optional == 1' builds will contain both regular and NEON code paths at the same time. Note that there doesn't seem to be a simple way to put the NEON-specific selection from that currently is in SkBitmapProcState_matrixProcs.cpp into src/opts/. Doing so would require much more drastic restructuring. This is also true of the other SkBitmapProcState source files that will be touched in a future patch. Review URL: https://codereview.appspot.com/6453065 git-svn-id: http://skia.googlecode.com/svn/trunk@4888 2bbb7eff-a529-9590-31e7-b0007b416f81
author: digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> 2012-08-01 14:25:07 +0000
committer: digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> 2012-08-01 14:25:07 +0000
commit: fce02aca62525c3041226501574f740f7ea3714b (patch)
tree: 9053a19f5850210917e47ef54c778ccd5cae686d /src/core/SkBitmapProcState_matrixProcs.cpp
parent: 47de6787a5aa677157fd468c5798eeb01c6c9139 (diff)
1 files changed, 35 insertions, 93 deletions
diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp
index 1e12f9a921..77c6200d03 100644
--- a/src/core/SkBitmapProcState_matrixProcs.cpp
+++ b/src/core/SkBitmapProcState_matrixProcs.cpp
@@ -8,6 +8,7 @@
 #include "SkPerspIter.h"
 #include "SkShader.h"
 #include "SkUtils.h"
+#include "SkUtilsArm.h"
 
 // Helper to ensure that when we shift down, we do it w/o sign-extension
 // so the caller doesn't have to manually mask off the top 16 bits
@@ -67,27 +68,31 @@ static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX,
 void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 
+// Compile neon code paths if needed
+#if !SK_ARM_NEON_IS_NONE
+
+// These are defined in src/opts/SkBitmapProcState_matrixProcs_neon.cpp
+extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
+extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
+
+#endif // !SK_ARM_NEON_IS_NONE
+
+// Compile non-neon code path if needed
+#if !SK_ARM_NEON_IS_ALWAYS
 #define MAKENAME(suffix)        ClampX_ClampY ## suffix
 #define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
 #define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
 #define CHECK_FOR_DECAL
-#if	defined(__ARM_HAVE_NEON)
-    #include "SkBitmapProcState_matrix_clamp.h"
-#else
-    #include "SkBitmapProcState_matrix.h"
-#endif
+#include "SkBitmapProcState_matrix.h"
 
 #define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
 #define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
 #define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
-#if	defined(__ARM_HAVE_NEON)
-    #include "SkBitmapProcState_matrix_repeat.h"
-#else
-    #include "SkBitmapProcState_matrix.h"
+#include "SkBitmapProcState_matrix.h"
 #endif
 
 #define MAKENAME(suffix)        GeneralXY ## suffix
@@ -228,52 +233,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
 {
     int i;
 
-#if	defined(__ARM_HAVE_NEON)
-    if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx2 = dx+dx;
-        SkFixed dx4 = dx2+dx2;
-        SkFixed dx8 = dx4+dx4;
-
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        SkFixed fx1, fx2, fx3;
-        int32x2_t lower, upper;
-        int32x4_t lbase, hbase;
-        uint16_t *dst16 = (uint16_t *)dst;
-
-        fx1 = fx+dx;
-        fx2 = fx1+dx;
-        fx3 = fx2+dx;
-
-        /* avoid an 'lbase unitialized' warning */
-        lbase = vdupq_n_s32(fx);
-        lbase = vsetq_lane_s32(fx1, lbase, 1);
-        lbase = vsetq_lane_s32(fx2, lbase, 2);
-        lbase = vsetq_lane_s32(fx3, lbase, 3);
-        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
-
-        /* take upper 16 of each, store, and bump everything */
-        do {
-            int32x4_t lout, hout;
-            uint16x8_t hi16;
-
-            lout = lbase;
-            hout = hbase;
-            /* gets hi's of all louts then hi's of all houts */
-            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
-            hi16 = vreinterpretq_u16_s32(hout);
-            vst1q_u16(dst16, hi16);
-
-            /* on to the next */
-            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
-            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
-            dst16 += 8;
-            count -= 8;
-            fx += dx8;
-        } while (count >= 8);
-        dst = (uint32_t *) dst16;
-    }
-#else
     for (i = (count >> 2); i > 0; --i)
     {
         *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
@@ -282,7 +241,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
         fx += dx+dx;
     }
     count &= 3;
-#endif
 
     uint16_t* xx = (uint16_t*)dst;
     for (i = count; i > 0; --i) {
@@ -293,42 +251,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
 void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
 {
 
-#if	defined(__ARM_HAVE_NEON)
-    if (count >= 8) {
-        int32x4_t wide_fx;
-        int32x4_t wide_fx2;
-        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
-
-        wide_fx = vdupq_n_s32(fx);
-        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
-        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
-        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
-
-        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
-
-        while (count >= 8) {
-            int32x4_t wide_out;
-            int32x4_t wide_out2;
-
-            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
-            wide_out = vorrq_s32(wide_out,
-            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
-
-            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
-            wide_out2 = vorrq_s32(wide_out2,
-            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
-
-            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
-            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
-
-            dst += 8;
-            fx += dx*8;
-            wide_fx = vaddq_s32(wide_fx, wide_dx8);
-            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
-            count -= 8;
-        }
-    }
-#endif
 
     if (count & 1)
     {
@@ -574,7 +496,17 @@ SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
         // clamp gets special version of filterOne
         fFilterOneX = SK_Fixed1;
         fFilterOneY = SK_Fixed1;
+#if SK_ARM_NEON_IS_NONE
         return ClampX_ClampY_Procs[index];
+#elif SK_ARM_NEON_IS_ALWAYS
+        return ClampX_ClampY_Procs_neon[index];
+#else // SK_ARM_NEON_IS_DYNAMIC
+        if (sk_cpu_arm_has_neon()) {
+            return ClampX_ClampY_Procs_neon[index];
+        } else {
+            return ClampX_ClampY_Procs[index];
+        }
+#endif
     }
     
     // all remaining procs use this form for filterOne
@@ -584,9 +516,19 @@ SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
     if (SkShader::kRepeat_TileMode == fTileModeX &&
         SkShader::kRepeat_TileMode == fTileModeY)
     {
+#if SK_ARM_NEON_IS_NONE
         return RepeatX_RepeatY_Procs[index];
+#elif SK_ARM_NEON_IS_ALWAYS
+        return RepeatX_RepeatY_Procs_neon[index];
+#else // SK_ARM_NEON_IS_DYNAMIC
+        if (sk_cpu_arm_has_neon()) {
+            return RepeatX_RepeatY_Procs_neon[index];
+        } else {
+            return RepeatX_RepeatY_Procs[index];
+        }
+#endif
     }
-    
+
     fTileProcX = choose_tile_proc(fTileModeX);
     fTileProcY = choose_tile_proc(fTileModeY);
     fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX);
author	digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	2012-08-01 14:25:07 +0000
committer	digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	2012-08-01 14:25:07 +0000
commit	fce02aca62525c3041226501574f740f7ea3714b (patch)
tree	9053a19f5850210917e47ef54c778ccd5cae686d /src/core/SkBitmapProcState_matrixProcs.cpp
parent	47de6787a5aa677157fd468c5798eeb01c6c9139 (diff)