/* NEON optimized code (C) COPYRIGHT 2009 Motorola
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBitmapProcState.h"
#include "SkPerspIter.h"
#include "SkShader.h"
#include "SkUtils.h"
#include "SkUtilsArm.h"

// Helper to ensure that when we shift down, we do it w/o sign-extension
// so the caller doesn't have to manually mask off the top 16 bits
//
static unsigned SK_USHIFT16(unsigned x) {
    return x >> 16;
}

/*  returns 0...(n-1) given any x (positive or negative).

    As an example, if n (which is always positive) is 5...

          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
 */
static inline int sk_int_mod(int x, int n) {
    SkASSERT(n > 0);
    // The unsigned compare is true for x < 0 as well as for x >= n, so values
    // already inside [0, n) skip the modulo entirely.
    if ((unsigned)x >= (unsigned)n) {
        if (x < 0) {
            // ~x == -x - 1 is non-negative, so the % is done on a
            // non-negative operand; the result is n - 1 - ((-x - 1) % n).
            x = n + ~(~x % n);
        } else {
            x = x % n;
        }
    }
    return x;
}

/*
 *  The decal_ functions require that
 *  1. dx > 0
 *  2. [fx, fx+dx, fx+2dx, fx+3dx, ... fx+(count-1)dx] are all <= maxX
 *
 *  In addition, we use SkFractionalInt to keep more fractional precision than
 *  just SkFixed, so we will abort the decal_ call if dx is very small, since
 *  the decal_ function just operates on SkFixed. If that were changed, we could
 *  skip the very_small test here.
 *
 *  Returns true when the span described by (frX, frDx, count) may be handed
 *  to the decal_ functions, false otherwise.
 */
static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX,
                                                   SkFractionalInt frDx,
                                                   int count, unsigned max) {
    SkFixed dx = SkFractionalIntToFixed(frDx);

    // if decal_ kept SkFractionalInt precision, this would just be dx <= 0
    // I just made up the 1/256. Just don't want to perceive accumulated error
    // if we truncate frDx and lose its low bits.
    if (dx <= SK_Fixed1 / 256) {
        return false;
    }

    // We cast to unsigned so we don't have to check for negative values, which
    // will now appear as very large positive values, and thus fail our test!
    SkFixed fx = SkFractionalIntToFixed(frX);
    if ((unsigned)SkFixedFloorToInt(fx) > max) {
        return false;
    }

    // Compute the last sample position in 64 bits: fx + dx * (count - 1) can
    // overflow a 32-bit SkFixed, which is undefined behavior and could wrap
    // to a value that wrongly passes the bounds test below.
    const int64_t lastFx = (int64_t)fx + (int64_t)dx * (count - 1);
    if (lastFx > 0x7FFFFFFF || lastFx < -0x7FFFFFFF - 1) {
        // Not representable as SkFixed, so it certainly is not inside the bitmap.
        return false;
    }
    return (unsigned)SkFixedFloorToInt((SkFixed)lastFx) < max;
}

// Defined later in this file; declared here so that code pulled in through
// SkBitmapProcState_matrix.h below can refer to them.
void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);

// Compile neon code paths if needed
#if !SK_ARM_NEON_IS_NONE

// These are defined in src/opts/SkBitmapProcState_matrixProcs_neon.cpp
extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];

#endif // !SK_ARM_NEON_IS_NONE

// SkBitmapProcState_matrix.h is included several times below, each time with
// a different MAKENAME / TILE* macro set, so the same header text is compiled
// once per tiling strategy.
// NOTE(review): presumably the header #undefs these macros at its end, since
// they are redefined here without an #undef -- confirm against the header.

// Compile non-neon code path if needed
#if !SK_ARM_NEON_IS_ALWAYS
#define MAKENAME(suffix)        ClampX_ClampY ## suffix
#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
#define CHECK_FOR_DECAL
#include "SkBitmapProcState_matrix.h"

#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#include "SkBitmapProcState_matrix.h"
#endif

// General case: the tile behavior is chosen at run time through the function
// pointers stored in the SkBitmapProcState (see chooseMatrixProc).
#define MAKENAME(suffix)        GeneralXY ## suffix
#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; (void) tileProcX; \
                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; (void) tileProcY; \
                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; (void) tileLowBitsProcX; \
                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY; (void) tileLowBitsProcY
#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX
#define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY #define PREAMBLE_ARG_X , tileProcX, tileLowBitsProcX #define PREAMBLE_ARG_Y , tileProcY, tileLowBitsProcY #define TILEX_PROCF(fx, max) SK_USHIFT16(tileProcX(fx) * ((max) + 1)) #define TILEY_PROCF(fy, max) SK_USHIFT16(tileProcY(fy) * ((max) + 1)) #define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1) #define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1) #include "SkBitmapProcState_matrix.h" static inline U16CPU fixed_clamp(SkFixed x) { #ifdef SK_CPU_HAS_CONDITIONAL_INSTR if (x < 0) x = 0; if (x >> 16) x = 0xFFFF; #else if (x >> 16) { #if 0 // is this faster? x = (~x >> 31) & 0xFFFF; #else if (x < 0) x = 0; else x = 0xFFFF; #endif } #endif return x; } static inline U16CPU fixed_repeat(SkFixed x) { return x & 0xFFFF; } // Visual Studio 2010 (MSC_VER=1600) optimizes bit-shift code incorrectly. // See http://code.google.com/p/skia/issues/detail?id=472 #if defined(_MSC_VER) && (_MSC_VER >= 1600) #pragma optimize("", off) #endif static inline U16CPU fixed_mirror(SkFixed x) { SkFixed s = x << 15 >> 31; // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval return (x ^ s) & 0xFFFF; } #if defined(_MSC_VER) && (_MSC_VER >= 1600) #pragma optimize("", on) #endif static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m) { if (SkShader::kClamp_TileMode == m) return fixed_clamp; if (SkShader::kRepeat_TileMode == m) return fixed_repeat; SkASSERT(SkShader::kMirror_TileMode == m); return fixed_mirror; } static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) { return (x >> 12) & 0xF; } static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) { return ((x * scale) >> 12) & 0xF; } static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) { if (SkShader::kClamp_TileMode == m) { return fixed_clamp_lowbits; } else { SkASSERT(SkShader::kMirror_TileMode == m || 
SkShader::kRepeat_TileMode == m); // mirror and repeat have the same behavior for the low bits. return fixed_repeat_or_mirrow_lowbits; } } static inline U16CPU int_clamp(int x, int n) { #ifdef SK_CPU_HAS_CONDITIONAL_INSTR if (x >= n) x = n - 1; if (x < 0) x = 0; #else if ((unsigned)x >= (unsigned)n) { if (x < 0) { x = 0; } else { x = n - 1; } } #endif return x; } static inline U16CPU int_repeat(int x, int n) { return sk_int_mod(x, n); } static inline U16CPU int_mirror(int x, int n) { x = sk_int_mod(x, 2 * n); if (x >= n) { x = n + ~(x - n); } return x; } #if 0 static void test_int_tileprocs() { for (int i = -8; i <= 8; i++) { SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3)); } } #endif static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) { if (SkShader::kClamp_TileMode == tm) return int_clamp; if (SkShader::kRepeat_TileMode == tm) return int_repeat; SkASSERT(SkShader::kMirror_TileMode == tm); return int_mirror; } ////////////////////////////////////////////////////////////////////////////// void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { int i; for (i = (count >> 2); i > 0; --i) { *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); fx += dx+dx; *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); fx += dx+dx; } count &= 3; uint16_t* xx = (uint16_t*)dst; for (i = count; i > 0; --i) { *xx++ = SkToU16(fx >> 16); fx += dx; } } void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { if (count & 1) { SkASSERT((fx >> (16 + 14)) == 0); *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); fx += dx; } while ((count -= 2) >= 0) { SkASSERT((fx >> (16 + 14)) == 0); *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); fx += dx; *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); fx += dx; } } /////////////////////////////////////////////////////////////////////////////// // stores the same as SCALE, but is cheaper to compute. 
Also since there is no // scale, we don't need/have a FILTER version static void fill_sequential(uint16_t xptr[], int start, int count) { #if 1 if (reinterpret_cast(xptr) & 0x2) { *xptr++ = start++; count -= 1; } if (count > 3) { uint32_t* xxptr = reinterpret_cast(xptr); uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1); uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3); start += count & ~3; int qcount = count >> 2; do { *xxptr++ = pattern0; pattern0 += 0x40004; *xxptr++ = pattern1; pattern1 += 0x40004; } while (--qcount != 0); xptr = reinterpret_cast(xxptr); count &= 3; } while (--count >= 0) { *xptr++ = start++; } #else for (int i = 0; i < count; i++) { *xptr++ = start++; } #endif } static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy, int x, int y) { SkPoint pt; s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, SkIntToScalar(y) + SK_ScalarHalf, &pt); **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16, s.fBitmap->height()); *xy += 1; // bump the ptr // return our starting X position return SkScalarToFixed(pt.fX) >> 16; } static void clampx_nofilter_trans(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); int xpos = nofilter_trans_preamble(s, &xy, x, y); const int width = s.fBitmap->width(); if (1 == width) { // all of the following X values must be 0 memset(xy, 0, count * sizeof(uint16_t)); return; } uint16_t* xptr = reinterpret_cast(xy); int n; // fill before 0 as needed if (xpos < 0) { n = -xpos; if (n > count) { n = count; } memset(xptr, 0, n * sizeof(uint16_t)); count -= n; if (0 == count) { return; } xptr += n; xpos = 0; } // fill in 0..width-1 if needed if (xpos < width) { n = width - xpos; if (n > count) { n = count; } fill_sequential(xptr, xpos, n); count -= n; if (0 == count) { return; } xptr += n; } // fill the remaining with the max value sk_memset16(xptr, width - 1, count); } static void repeatx_nofilter_trans(const 
SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); int xpos = nofilter_trans_preamble(s, &xy, x, y); const int width = s.fBitmap->width(); if (1 == width) { // all of the following X values must be 0 memset(xy, 0, count * sizeof(uint16_t)); return; } uint16_t* xptr = reinterpret_cast(xy); int start = sk_int_mod(xpos, width); int n = width - start; if (n > count) { n = count; } fill_sequential(xptr, start, n); xptr += n; count -= n; while (count >= width) { fill_sequential(xptr, 0, width); xptr += width; count -= width; } if (count > 0) { fill_sequential(xptr, 0, count); } } static void fill_backwards(uint16_t xptr[], int pos, int count) { for (int i = 0; i < count; i++) { SkASSERT(pos >= 0); xptr[i] = pos--; } } static void mirrorx_nofilter_trans(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); int xpos = nofilter_trans_preamble(s, &xy, x, y); const int width = s.fBitmap->width(); if (1 == width) { // all of the following X values must be 0 memset(xy, 0, count * sizeof(uint16_t)); return; } uint16_t* xptr = reinterpret_cast(xy); // need to know our start, and our initial phase (forward or backward) bool forward; int n; int start = sk_int_mod(xpos, 2 * width); if (start >= width) { start = width + ~(start - width); forward = false; n = start + 1; // [start .. 0] } else { forward = true; n = width - start; // [start .. 
width) } if (n > count) { n = count; } if (forward) { fill_sequential(xptr, start, n); } else { fill_backwards(xptr, start, n); } forward = !forward; xptr += n; count -= n; while (count >= width) { if (forward) { fill_sequential(xptr, 0, width); } else { fill_backwards(xptr, width - 1, width); } forward = !forward; xptr += width; count -= width; } if (count > 0) { if (forward) { fill_sequential(xptr, 0, count); } else { fill_backwards(xptr, width - 1, count); } } } /////////////////////////////////////////////////////////////////////////////// SkBitmapProcState::MatrixProc SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { // test_int_tileprocs(); // check for our special case when there is no scale/affine/perspective if (trivial_matrix) { SkASSERT(!fDoFilter); fIntTileProcY = choose_int_tile_proc(fTileModeY); switch (fTileModeX) { case SkShader::kClamp_TileMode: return clampx_nofilter_trans; case SkShader::kRepeat_TileMode: return repeatx_nofilter_trans; case SkShader::kMirror_TileMode: return mirrorx_nofilter_trans; } } int index = 0; if (fDoFilter) { index = 1; } if (fInvType & SkMatrix::kPerspective_Mask) { index += 4; } else if (fInvType & SkMatrix::kAffine_Mask) { index += 2; } if (SkShader::kClamp_TileMode == fTileModeX && SkShader::kClamp_TileMode == fTileModeY) { // clamp gets special version of filterOne fFilterOneX = SK_Fixed1; fFilterOneY = SK_Fixed1; return SK_ARM_NEON_WRAP(ClampX_ClampY_Procs)[index]; } // all remaining procs use this form for filterOne fFilterOneX = SK_Fixed1 / fBitmap->width(); fFilterOneY = SK_Fixed1 / fBitmap->height(); if (SkShader::kRepeat_TileMode == fTileModeX && SkShader::kRepeat_TileMode == fTileModeY) { return SK_ARM_NEON_WRAP(RepeatX_RepeatY_Procs)[index]; } fTileProcX = choose_tile_proc(fTileModeX); fTileProcY = choose_tile_proc(fTileModeY); fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX); fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY); return GeneralXY_Procs[index]; }