diff options
author | digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-08-01 14:25:07 +0000 |
---|---|---|
committer | digit@google.com <digit@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-08-01 14:25:07 +0000 |
commit | fce02aca62525c3041226501574f740f7ea3714b (patch) | |
tree | 9053a19f5850210917e47ef54c778ccd5cae686d /src/core/SkBitmapProcState_matrixProcs.cpp | |
parent | 47de6787a5aa677157fd468c5798eeb01c6c9139 (diff) |
arm: dynamic NEON support for SkBitmapProcState matrix operations.
This patch implements dynamic ARM NEON support for the functions
implemented by src/core/SkBitmapProcState_matrixProcs.cpp.
- Because the SkBitmapProcState_matrix_{clamp,repeat}.h headers
are NEON-specific, they are renamed with a _neon.h suffix, and
moved to src/opts/ (from src/core/)
- Add a new file src/opts/SkBitmapProcState_matrixProcs_neon.cpp
which implements the NEON code paths for all builds, and add
it to the 'opts_neon' static library.
- Modify SkBitmapProcState_matrixProcs.cpp to select the right
code-path depending on our build configuration. Note that in
the case where 'arm_neon == 1', we do not embed regular ARM
code paths in the final binary. Only 'arm_neon_optional == 1'
builds will contain both regular and NEON code paths at the
same time.
Note that there doesn't seem to be a simple way to put the
NEON-specific selection code that is currently in
SkBitmapProcState_matrixProcs.cpp into src/opts/. Doing so
would require much more drastic restructuring. This is also
true of the other SkBitmapProcState source files that will
be touched in a future patch.
Review URL: https://codereview.appspot.com/6453065
git-svn-id: http://skia.googlecode.com/svn/trunk@4888 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/core/SkBitmapProcState_matrixProcs.cpp')
-rw-r--r-- | src/core/SkBitmapProcState_matrixProcs.cpp | 128 |
1 files changed, 35 insertions, 93 deletions
diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp index 1e12f9a921..77c6200d03 100644 --- a/src/core/SkBitmapProcState_matrixProcs.cpp +++ b/src/core/SkBitmapProcState_matrixProcs.cpp @@ -8,6 +8,7 @@ #include "SkPerspIter.h" #include "SkShader.h" #include "SkUtils.h" +#include "SkUtilsArm.h" // Helper to ensure that when we shift down, we do it w/o sign-extension // so the caller doesn't have to manually mask off the top 16 bits @@ -67,27 +68,31 @@ static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX, void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); +// Compile neon code paths if needed +#if !SK_ARM_NEON_IS_NONE + +// These are defined in src/opts/SkBitmapProcState_matrixProcs_neon.cpp +extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; +extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; + +#endif // !SK_ARM_NEON_IS_NONE + +// Compile non-neon code path if needed +#if !SK_ARM_NEON_IS_ALWAYS #define MAKENAME(suffix) ClampX_ClampY ## suffix #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) #define CHECK_FOR_DECAL -#if defined(__ARM_HAVE_NEON) - #include "SkBitmapProcState_matrix_clamp.h" -#else - #include "SkBitmapProcState_matrix.h" -#endif +#include "SkBitmapProcState_matrix.h" #define MAKENAME(suffix) RepeatX_RepeatY ## suffix #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) -#if defined(__ARM_HAVE_NEON) - #include 
"SkBitmapProcState_matrix_repeat.h" -#else - #include "SkBitmapProcState_matrix.h" +#include "SkBitmapProcState_matrix.h" #endif #define MAKENAME(suffix) GeneralXY ## suffix @@ -228,52 +233,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { int i; -#if defined(__ARM_HAVE_NEON) - if (count >= 8) { - /* SkFixed is 16.16 fixed point */ - SkFixed dx2 = dx+dx; - SkFixed dx4 = dx2+dx2; - SkFixed dx8 = dx4+dx4; - - /* now build fx/fx+dx/fx+2dx/fx+3dx */ - SkFixed fx1, fx2, fx3; - int32x2_t lower, upper; - int32x4_t lbase, hbase; - uint16_t *dst16 = (uint16_t *)dst; - - fx1 = fx+dx; - fx2 = fx1+dx; - fx3 = fx2+dx; - - /* avoid an 'lbase unitialized' warning */ - lbase = vdupq_n_s32(fx); - lbase = vsetq_lane_s32(fx1, lbase, 1); - lbase = vsetq_lane_s32(fx2, lbase, 2); - lbase = vsetq_lane_s32(fx3, lbase, 3); - hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); - - /* take upper 16 of each, store, and bump everything */ - do { - int32x4_t lout, hout; - uint16x8_t hi16; - - lout = lbase; - hout = hbase; - /* gets hi's of all louts then hi's of all houts */ - asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); - hi16 = vreinterpretq_u16_s32(hout); - vst1q_u16(dst16, hi16); - - /* on to the next */ - lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); - hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); - dst16 += 8; - count -= 8; - fx += dx8; - } while (count >= 8); - dst = (uint32_t *) dst16; - } -#else for (i = (count >> 2); i > 0; --i) { *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); @@ -282,7 +241,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) fx += dx+dx; } count &= 3; -#endif uint16_t* xx = (uint16_t*)dst; for (i = count; i > 0; --i) { @@ -293,42 +251,6 @@ void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { -#if defined(__ARM_HAVE_NEON) - if (count >= 8) { - int32x4_t wide_fx; - int32x4_t wide_fx2; - int32x4_t 
wide_dx8 = vdupq_n_s32(dx*8); - - wide_fx = vdupq_n_s32(fx); - wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); - wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); - wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); - - wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); - - while (count >= 8) { - int32x4_t wide_out; - int32x4_t wide_out2; - - wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); - wide_out = vorrq_s32(wide_out, - vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); - - wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); - wide_out2 = vorrq_s32(wide_out2, - vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); - - vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); - vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); - - dst += 8; - fx += dx*8; - wide_fx = vaddq_s32(wide_fx, wide_dx8); - wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); - count -= 8; - } - } -#endif if (count & 1) { @@ -574,7 +496,17 @@ SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { // clamp gets special version of filterOne fFilterOneX = SK_Fixed1; fFilterOneY = SK_Fixed1; +#if SK_ARM_NEON_IS_NONE return ClampX_ClampY_Procs[index]; +#elif SK_ARM_NEON_IS_ALWAYS + return ClampX_ClampY_Procs_neon[index]; +#else // SK_ARM_NEON_IS_DYNAMIC + if (sk_cpu_arm_has_neon()) { + return ClampX_ClampY_Procs_neon[index]; + } else { + return ClampX_ClampY_Procs[index]; + } +#endif } // all remaining procs use this form for filterOne @@ -584,9 +516,19 @@ SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { if (SkShader::kRepeat_TileMode == fTileModeX && SkShader::kRepeat_TileMode == fTileModeY) { +#if SK_ARM_NEON_IS_NONE return RepeatX_RepeatY_Procs[index]; +#elif SK_ARM_NEON_IS_ALWAYS + return RepeatX_RepeatY_Procs_neon[index]; +#else // SK_ARM_NEON_IS_DYNAMIC + if (sk_cpu_arm_has_neon()) { + return RepeatX_RepeatY_Procs_neon[index]; + } else { + return RepeatX_RepeatY_Procs[index]; + } +#endif } - + fTileProcX = choose_tile_proc(fTileModeX); fTileProcY = choose_tile_proc(fTileModeY); 
fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX); |