diff options
author | xiangze.zhang <xiangze.zhang@intel.com> | 2016-11-17 18:39:38 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-11-17 18:39:38 -0800 |
commit | d2265e537c8015f8115d7b5b7f6de970aa688172 (patch) | |
tree | b58af83005d4b3754eefe5dfe33f3c321f807848 | |
parent | f8afab96776f15c59ae0d854997216e5f1a3b332 (diff) |
Port convolve functions to SkOpts
This patch moves the C++/SSE2/NEON implementations of convolve functions
into the same place and uses SkOpts framework.
Also some indentation fix.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2500113004
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2500113004
-rw-r--r-- | gn/opts.gni | 5 | ||||
-rw-r--r-- | src/core/SkBitmapScaler.cpp | 173 | ||||
-rw-r--r-- | src/core/SkBitmapScaler.h | 6 | ||||
-rw-r--r-- | src/core/SkConvolver.cpp | 219 | ||||
-rw-r--r-- | src/core/SkConvolver.h | 30 | ||||
-rw-r--r-- | src/core/SkOpts.cpp | 6 | ||||
-rw-r--r-- | src/core/SkOpts.h | 10 | ||||
-rw-r--r-- | src/opts/SkBitmapFilter_opts.h | 940 | ||||
-rw-r--r-- | src/opts/SkBitmapFilter_opts_SSE2.cpp | 450 | ||||
-rw-r--r-- | src/opts/SkBitmapFilter_opts_SSE2.h | 30 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_arm_neon.cpp | 379 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 30 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_mips_dsp.cpp | 2 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_none.cpp | 3 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 11 |
15 files changed, 1054 insertions, 1240 deletions
diff --git a/gn/opts.gni b/gn/opts.gni index 2332de3143..02a95cbad5 100644 --- a/gn/opts.gni +++ b/gn/opts.gni @@ -13,7 +13,7 @@ none = [ ] armv7 = [ - "$_src/opts/SkBitmapProcState_opts_arm.cpp", + "$_src/opts/SkBitmapProcState_opts_none.cpp", "$_src/opts/SkBlitMask_opts_arm.cpp", "$_src/opts/SkBlitRow_opts_arm.cpp", ] @@ -28,7 +28,7 @@ neon = [ arm64 = [ "$_src/opts/SkBitmapProcState_arm_neon.cpp", "$_src/opts/SkBitmapProcState_matrixProcs_neon.cpp", - "$_src/opts/SkBitmapProcState_opts_arm.cpp", + "$_src/opts/SkBitmapProcState_opts_none.cpp", "$_src/opts/SkBlitMask_opts_arm.cpp", "$_src/opts/SkBlitMask_opts_arm_neon.cpp", "$_src/opts/SkBlitRow_opts_arm.cpp", @@ -44,7 +44,6 @@ mips_dsp = [ ] sse2 = [ - "$_src/opts/SkBitmapFilter_opts_SSE2.cpp", "$_src/opts/SkBitmapProcState_opts_SSE2.cpp", "$_src/opts/SkBlitRow_opts_SSE2.cpp", "$_src/opts/opts_check_x86.cpp", diff --git a/src/core/SkBitmapScaler.cpp b/src/core/SkBitmapScaler.cpp index 25fbd1fe78..b4ade85a75 100644 --- a/src/core/SkBitmapScaler.cpp +++ b/src/core/SkBitmapScaler.cpp @@ -109,93 +109,92 @@ SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method, // the coefficients can be shared. For periods of 1 we can consider // loading the factors only once outside the borders. void SkResizeFilter::computeFilters(int srcSize, - float destSubsetLo, float destSubsetSize, - float scale, - SkConvolutionFilter1D* output) { - float destSubsetHi = destSubsetLo + destSubsetSize; // [lo, hi) - - // When we're doing a magnification, the scale will be larger than one. This - // means the destination pixels are much smaller than the source pixels, and - // that the range covered by the filter won't necessarily cover any source - // pixel boundaries. Therefore, we use these clamped values (max of 1) for - // some computations. - float clampedScale = SkTMin(1.0f, scale); - - // This is how many source pixels from the center we need to count - // to support the filtering function. - float srcSupport = fBitmapFilter->width() / clampedScale; - - float invScale = 1.0f / scale; - - SkSTArray<64, float, true> filterValuesArray; - SkSTArray<64, SkConvolutionFilter1D::ConvolutionFixed, true> fixedFilterValuesArray; - - // Loop over all pixels in the output range. We will generate one set of - // filter values for each one. Those values will tell us how to blend the - // source pixels to compute the destination pixel. - - // This is the pixel in the source directly under the pixel in the dest. - // Note that we base computations on the "center" of the pixels. To see - // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x - // downscale should "cover" the pixels around the pixel with *its center* - // at coordinates (2.5, 2.5) in the source, not those around (0, 0). - // Hence we need to scale coordinates (0.5, 0.5), not (0, 0). - destSubsetLo = SkScalarFloorToScalar(destSubsetLo); - destSubsetHi = SkScalarCeilToScalar(destSubsetHi); - float srcPixel = (destSubsetLo + 0.5f) * invScale; - int destLimit = SkScalarTruncToInt(destSubsetHi - destSubsetLo); - output->reserveAdditional(destLimit, SkScalarCeilToInt(destLimit * srcSupport * 2)); - for (int destI = 0; destI < destLimit; srcPixel += invScale, destI++) - { - // Compute the (inclusive) range of source pixels the filter covers. - float srcBegin = SkTMax(0.f, SkScalarFloorToScalar(srcPixel - srcSupport)); - float srcEnd = SkTMin(srcSize - 1.f, SkScalarCeilToScalar(srcPixel + srcSupport)); - - // Compute the unnormalized filter value at each location of the source - // it covers. - - // Sum of the filter values for normalizing. - // Distance from the center of the filter, this is the filter coordinate - // in source space. We also need to consider the center of the pixel - // when comparing distance against 'srcPixel'. In the 5x downscale - // example used above the distance from the center of the filter to - // the pixel with coordinates (2, 2) should be 0, because its center - // is at (2.5, 2.5). - float destFilterDist = (srcBegin + 0.5f - srcPixel) * clampedScale; - int filterCount = SkScalarTruncToInt(srcEnd - srcBegin) + 1; - if (filterCount <= 0) { - // true when srcSize is equal to srcPixel - srcSupport; this may be a bug - return; - } - filterValuesArray.reset(filterCount); - float filterSum = fBitmapFilter->evaluate_n(destFilterDist, clampedScale, filterCount, + float destSubsetLo, float destSubsetSize, + float scale, + SkConvolutionFilter1D* output) { + float destSubsetHi = destSubsetLo + destSubsetSize; // [lo, hi) + + // When we're doing a magnification, the scale will be larger than one. This + // means the destination pixels are much smaller than the source pixels, and + // that the range covered by the filter won't necessarily cover any source + // pixel boundaries. Therefore, we use these clamped values (max of 1) for + // some computations. + float clampedScale = SkTMin(1.0f, scale); + + // This is how many source pixels from the center we need to count + // to support the filtering function. + float srcSupport = fBitmapFilter->width() / clampedScale; + + float invScale = 1.0f / scale; + + SkSTArray<64, float, true> filterValuesArray; + SkSTArray<64, SkConvolutionFilter1D::ConvolutionFixed, true> fixedFilterValuesArray; + + // Loop over all pixels in the output range. We will generate one set of + // filter values for each one. Those values will tell us how to blend the + // source pixels to compute the destination pixel. + + // This is the pixel in the source directly under the pixel in the dest. + // Note that we base computations on the "center" of the pixels. To see + // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x + // downscale should "cover" the pixels around the pixel with *its center* + // at coordinates (2.5, 2.5) in the source, not those around (0, 0). + // Hence we need to scale coordinates (0.5, 0.5), not (0, 0). + destSubsetLo = SkScalarFloorToScalar(destSubsetLo); + destSubsetHi = SkScalarCeilToScalar(destSubsetHi); + float srcPixel = (destSubsetLo + 0.5f) * invScale; + int destLimit = SkScalarTruncToInt(destSubsetHi - destSubsetLo); + output->reserveAdditional(destLimit, SkScalarCeilToInt(destLimit * srcSupport * 2)); + for (int destI = 0; destI < destLimit; srcPixel += invScale, destI++) { + // Compute the (inclusive) range of source pixels the filter covers. + float srcBegin = SkTMax(0.f, SkScalarFloorToScalar(srcPixel - srcSupport)); + float srcEnd = SkTMin(srcSize - 1.f, SkScalarCeilToScalar(srcPixel + srcSupport)); + + // Compute the unnormalized filter value at each location of the source + // it covers. + + // Sum of the filter values for normalizing. + // Distance from the center of the filter, this is the filter coordinate + // in source space. We also need to consider the center of the pixel + // when comparing distance against 'srcPixel'. In the 5x downscale + // example used above the distance from the center of the filter to + // the pixel with coordinates (2, 2) should be 0, because its center + // is at (2.5, 2.5). + float destFilterDist = (srcBegin + 0.5f - srcPixel) * clampedScale; + int filterCount = SkScalarTruncToInt(srcEnd - srcBegin) + 1; + if (filterCount <= 0) { + // true when srcSize is equal to srcPixel - srcSupport; this may be a bug + return; + } + filterValuesArray.reset(filterCount); + float filterSum = fBitmapFilter->evaluate_n(destFilterDist, clampedScale, filterCount, filterValuesArray.begin()); - // The filter must be normalized so that we don't affect the brightness of - // the image. Convert to normalized fixed point. - int fixedSum = 0; - fixedFilterValuesArray.reset(filterCount); - const float* filterValues = filterValuesArray.begin(); - SkConvolutionFilter1D::ConvolutionFixed* fixedFilterValues = fixedFilterValuesArray.begin(); - float invFilterSum = 1 / filterSum; - for (int fixedI = 0; fixedI < filterCount; fixedI++) { - int curFixed = SkConvolutionFilter1D::FloatToFixed(filterValues[fixedI] * invFilterSum); - fixedSum += curFixed; - fixedFilterValues[fixedI] = SkToS16(curFixed); + // The filter must be normalized so that we don't affect the brightness of + // the image. Convert to normalized fixed point. + int fixedSum = 0; + fixedFilterValuesArray.reset(filterCount); + const float* filterValues = filterValuesArray.begin(); + SkConvolutionFilter1D::ConvolutionFixed* fixedFilterValues = fixedFilterValuesArray.begin(); + float invFilterSum = 1 / filterSum; + for (int fixedI = 0; fixedI < filterCount; fixedI++) { + int curFixed = SkConvolutionFilter1D::FloatToFixed(filterValues[fixedI] * invFilterSum); + fixedSum += curFixed; + fixedFilterValues[fixedI] = SkToS16(curFixed); + } + SkASSERT(fixedSum <= 0x7FFF); + + // The conversion to fixed point will leave some rounding errors, which + // we add back in to avoid affecting the brightness of the image. We + // arbitrarily add this to the center of the filter array (this won't always + // be the center of the filter function since it could get clipped on the + // edges, but it doesn't matter enough to worry about that case). + int leftovers = SkConvolutionFilter1D::FloatToFixed(1) - fixedSum; + fixedFilterValues[filterCount / 2] += leftovers; + + // Now it's ready to go. + output->AddFilter(SkScalarFloorToInt(srcBegin), fixedFilterValues, filterCount); } - SkASSERT(fixedSum <= 0x7FFF); - - // The conversion to fixed point will leave some rounding errors, which - // we add back in to avoid affecting the brightness of the image. We - // arbitrarily add this to the center of the filter array (this won't always - // be the center of the filter function since it could get clipped on the - // edges, but it doesn't matter enough to worry about that case). - int leftovers = SkConvolutionFilter1D::FloatToFixed(1) - fixedSum; - fixedFilterValues[filterCount / 2] += leftovers; - - // Now it's ready to go. - output->AddFilter(SkScalarFloorToInt(srcBegin), fixedFilterValues, filterCount); - } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -214,9 +213,6 @@ bool SkBitmapScaler::Resize(const SkPixmap& result, const SkPixmap& source, Resi return false; } - SkConvolutionProcs convolveProcs= { nullptr, nullptr, nullptr }; - PlatformConvolutionProcs(&convolveProcs); - SkRect destSubset = SkRect::MakeIWH(result.width(), result.height()); SkResizeFilter filter(method, source.width(), source.height(), @@ -230,8 +226,7 @@ bool SkBitmapScaler::Resize(const SkPixmap& result, const SkPixmap& source, Resi return BGRAConvolve2D(sourceSubset, static_cast<int>(source.rowBytes()), !source.isOpaque(), filter.xFilter(), filter.yFilter(), static_cast<int>(result.rowBytes()), - static_cast<unsigned char*>(result.writable_addr()), - convolveProcs, true); + static_cast<unsigned char*>(result.writable_addr())); } bool SkBitmapScaler::Resize(SkBitmap* resultPtr, const SkPixmap& source, ResizeMethod method, diff --git a/src/core/SkBitmapScaler.h b/src/core/SkBitmapScaler.h index 3d734d6efc..c96be0dbf8 100644 --- a/src/core/SkBitmapScaler.h +++ b/src/core/SkBitmapScaler.h @@ -41,12 +41,6 @@ public: */ static bool Resize(SkBitmap* result, const SkPixmap& src, ResizeMethod method, int dest_width, int dest_height, SkBitmap::Allocator* = nullptr); - - /** Platforms can also optionally overwrite the convolution functions - if we have SIMD versions of them. - */ - - static void PlatformConvolutionProcs(SkConvolutionProcs*); }; #endif diff --git a/src/core/SkConvolver.cpp b/src/core/SkConvolver.cpp index c32cc03472..81fa9ee616 100644 --- a/src/core/SkConvolver.cpp +++ b/src/core/SkConvolver.cpp @@ -3,22 +3,10 @@ // found in the LICENSE file. #include "SkConvolver.h" +#include "SkOpts.h" #include "SkTArray.h" namespace { - - // Converts the argument to an 8-bit unsigned value by clamping to the range - // 0-255. - inline unsigned char ClampTo8(int a) { - if (static_cast<unsigned>(a) < 256) { - return a; // Avoid the extra check in the common case. - } - if (a < 0) { - return 0; - } - return 255; - } - // Stores a list of rows in a circular buffer. The usage is you write into it // by calling AdvanceRow. It will keep track of which row in the buffer it // should use next, and the total number of rows added. @@ -108,169 +96,6 @@ namespace { SkTArray<unsigned char*> fRowAddresses; }; -// Convolves horizontally along a single row. The row data is given in -// |srcData| and continues for the numValues() of the filter. -template<bool hasAlpha> - void ConvolveHorizontally(const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow) { - // Loop over each pixel on this row in the output image. - int numValues = filter.numValues(); - for (int outX = 0; outX < numValues; outX++) { - // Get the filter that determines the current output pixel. - int filterOffset, filterLength; - const SkConvolutionFilter1D::ConvolutionFixed* filterValues = - filter.FilterForValue(outX, &filterOffset, &filterLength); - - // Compute the first pixel in this row that the filter affects. It will - // touch |filterLength| pixels (4 bytes each) after this. - const unsigned char* rowToFilter = &srcData[filterOffset * 4]; - - // Apply the filter to the row to get the destination pixel in |accum|. - int accum[4] = {0}; - for (int filterX = 0; filterX < filterLength; filterX++) { - SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; - accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; - accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; - accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; - if (hasAlpha) { - accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; - } - } - - // Bring this value back in range. All of the filter scaling factors - // are in fixed point with kShiftBits bits of fractional part. - accum[0] >>= SkConvolutionFilter1D::kShiftBits; - accum[1] >>= SkConvolutionFilter1D::kShiftBits; - accum[2] >>= SkConvolutionFilter1D::kShiftBits; - if (hasAlpha) { - accum[3] >>= SkConvolutionFilter1D::kShiftBits; - } - - // Store the new pixel. - outRow[outX * 4 + 0] = ClampTo8(accum[0]); - outRow[outX * 4 + 1] = ClampTo8(accum[1]); - outRow[outX * 4 + 2] = ClampTo8(accum[2]); - if (hasAlpha) { - outRow[outX * 4 + 3] = ClampTo8(accum[3]); - } - } - } - - // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally - // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles - // suffer here too. - // - // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575 - #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) - #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline)) - #else - #define SK_MAYBE_DISABLE_VECTORIZATION - #endif - - SK_MAYBE_DISABLE_VECTORIZATION - static void ConvolveHorizontallyAlpha(const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow) { - return ConvolveHorizontally<true>(srcData, filter, outRow); - } - - SK_MAYBE_DISABLE_VECTORIZATION - static void ConvolveHorizontallyNoAlpha(const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow) { - return ConvolveHorizontally<false>(srcData, filter, outRow); - } - - #undef SK_MAYBE_DISABLE_VECTORIZATION - - -// Does vertical convolution to produce one output row. The filter values and -// length are given in the first two parameters. These are applied to each -// of the rows pointed to in the |sourceDataRows| array, with each row -// being |pixelWidth| wide. -// -// The output must have room for |pixelWidth * 4| bytes. -template<bool hasAlpha> - void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow) { - // We go through each column in the output and do a vertical convolution, - // generating one output pixel each time. - for (int outX = 0; outX < pixelWidth; outX++) { - // Compute the number of bytes over in each row that the current column - // we're convolving starts at. The pixel will cover the next 4 bytes. - int byteOffset = outX * 4; - - // Apply the filter to one column of pixels. - int accum[4] = {0}; - for (int filterY = 0; filterY < filterLength; filterY++) { - SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; - accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; - accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; - accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; - if (hasAlpha) { - accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; - } - } - - // Bring this value back in range. All of the filter scaling factors - // are in fixed point with kShiftBits bits of precision. - accum[0] >>= SkConvolutionFilter1D::kShiftBits; - accum[1] >>= SkConvolutionFilter1D::kShiftBits; - accum[2] >>= SkConvolutionFilter1D::kShiftBits; - if (hasAlpha) { - accum[3] >>= SkConvolutionFilter1D::kShiftBits; - } - - // Store the new pixel. - outRow[byteOffset + 0] = ClampTo8(accum[0]); - outRow[byteOffset + 1] = ClampTo8(accum[1]); - outRow[byteOffset + 2] = ClampTo8(accum[2]); - if (hasAlpha) { - unsigned char alpha = ClampTo8(accum[3]); - - // Make sure the alpha channel doesn't come out smaller than any of the - // color channels. We use premultipled alpha channels, so this should - // never happen, but rounding errors will cause this from time to time. - // These "impossible" colors will cause overflows (and hence random pixel - // values) when the resulting bitmap is drawn to the screen. - // - // We only need to do this when generating the final output row (here). - int maxColorChannel = SkTMax(outRow[byteOffset + 0], - SkTMax(outRow[byteOffset + 1], - outRow[byteOffset + 2])); - if (alpha < maxColorChannel) { - outRow[byteOffset + 3] = maxColorChannel; - } else { - outRow[byteOffset + 3] = alpha; - } - } else { - // No alpha channel, the image is opaque. - outRow[byteOffset + 3] = 0xff; - } - } - } - - void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow, - bool sourceHasAlpha) { - if (sourceHasAlpha) { - ConvolveVertically<true>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } else { - ConvolveVertically<false>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } - } - } // namespace // SkConvolutionFilter1D --------------------------------------------------------- @@ -346,9 +171,7 @@ bool BGRAConvolve2D(const unsigned char* sourceData, const SkConvolutionFilter1D& filterX, const SkConvolutionFilter1D& filterY, int outputByteRowStride, - unsigned char* output, - const SkConvolutionProcs& convolveProcs, - bool useSimdIfPossible) { + unsigned char* output) { int maxYFilterSize = filterY.maxFilter(); @@ -374,7 +197,7 @@ bool BGRAConvolve2D(const unsigned char* sourceData, // convolution pass yet. Somehow Windows does not like it. int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; int rowBufferHeight = maxYFilterSize + - (convolveProcs.fConvolve4RowsHorizontally ? 4 : 0); + (SkOpts::convolve_4_rows_horizontally != nullptr ? 4 : 0); // check for too-big allocation requests : crbug.com/528628 { @@ -410,7 +233,7 @@ bool BGRAConvolve2D(const unsigned char* sourceData, // Generate output rows until we have enough to run the current filter. while (nextXRow < filterOffset + filterLength) { - if (convolveProcs.fConvolve4RowsHorizontally && + if (SkOpts::convolve_4_rows_horizontally != nullptr && nextXRow + 3 < lastFilterOffset + lastFilterLength) { const unsigned char* src[4]; unsigned char* outRow[4]; @@ -418,24 +241,12 @@ bool BGRAConvolve2D(const unsigned char* sourceData, src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride]; outRow[i] = rowBuffer.advanceRow(); } - convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth); + SkOpts::convolve_4_rows_horizontally(src, filterX, outRow, 4*rowBufferWidth); nextXRow += 4; } else { - if (convolveProcs.fConvolveHorizontally) { - convolveProcs.fConvolveHorizontally( + SkOpts::convolve_horizontally( &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX, rowBuffer.advanceRow(), sourceHasAlpha); - } else { - if (sourceHasAlpha) { - ConvolveHorizontallyAlpha( - &sourceData[(uint64_t)nextXRow * sourceByteRowStride], - filterX, rowBuffer.advanceRow()); - } else { - ConvolveHorizontallyNoAlpha( - &sourceData[(uint64_t)nextXRow * sourceByteRowStride], - filterX, rowBuffer.advanceRow()); - } - } nextXRow++; } } @@ -448,22 +259,14 @@ bool BGRAConvolve2D(const unsigned char* sourceData, unsigned char* const* rowsToConvolve = rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); - // Now compute the start of the subset of those rows that the filter - // needs. + // Now compute the start of the subset of those rows that the filter needs. unsigned char* const* firstRowForFilter = &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; - if (convolveProcs.fConvolveVertically) { - convolveProcs.fConvolveVertically(filterValues, filterLength, - firstRowForFilter, - filterX.numValues(), curOutputRow, - sourceHasAlpha); - } else { - ConvolveVertically(filterValues, filterLength, - firstRowForFilter, - filterX.numValues(), curOutputRow, - sourceHasAlpha); - } + SkOpts::convolve_vertically(filterValues, filterLength, + firstRowForFilter, + filterX.numValues(), curOutputRow, + sourceHasAlpha); } return true; } diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h index 28a08dfa68..4c4b1fd711 100644 --- a/src/core/SkConvolver.h +++ b/src/core/SkConvolver.h @@ -140,32 +140,6 @@ private: int fMaxFilter; }; -typedef void (*SkConvolveVertically_pointer)( - const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow, - bool hasAlpha); -typedef void (*SkConvolve4RowsHorizontally_pointer)( - const unsigned char* srcData[4], - const SkConvolutionFilter1D& filter, - unsigned char* outRow[4], - size_t outRowBytes); -typedef void (*SkConvolveHorizontally_pointer)( - const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow, - bool hasAlpha); - -struct SkConvolutionProcs { - SkConvolveVertically_pointer fConvolveVertically; - SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally; - SkConvolveHorizontally_pointer fConvolveHorizontally; -}; - - - // Does a two-dimensional convolution on the given source image. // // It is assumed the source pixel offsets referenced in the input filters @@ -194,8 +168,6 @@ SK_API bool BGRAConvolve2D(const unsigned char* sourceData, const SkConvolutionFilter1D& xfilter, const SkConvolutionFilter1D& yfilter, int outputByteRowStride, - unsigned char* output, - const SkConvolutionProcs&, - bool useSimdIfPossible); + unsigned char* output); #endif // SK_CONVOLVER_H diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp index 6648909d1a..6c13dd05e1 100644 --- a/src/core/SkOpts.cpp +++ b/src/core/SkOpts.cpp @@ -36,6 +36,7 @@ #define SK_OPTS_NS portable #endif +#include "SkBitmapFilter_opts.h" #include "SkBlend_opts.h" #include "SkBlitMask_opts.h" #include "SkBlitRow_opts.h" @@ -90,6 +91,11 @@ namespace SkOpts { DEFINE_DEFAULT(hash_fn); DEFINE_DEFAULT(compile_pipeline); + + DEFINE_DEFAULT(convolve_vertically); + DEFINE_DEFAULT(convolve_horizontally); + DEFINE_DEFAULT(convolve_4_rows_horizontally); + #undef DEFINE_DEFAULT // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp. diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h index 1d5e333729..22be97c332 100644 --- a/src/core/SkOpts.h +++ b/src/core/SkOpts.h @@ -8,6 +8,7 @@ #ifndef SkOpts_DEFINED #define SkOpts_DEFINED +#include "SkConvolver.h" #include "SkRasterPipeline.h" #include "SkTextureCompressor.h" #include "SkTypes.h" @@ -76,6 +77,15 @@ namespace SkOpts { extern std::function<void(size_t, size_t, size_t)> (*compile_pipeline)(const SkRasterPipeline::Stage*, int); + + extern void (*convolve_vertically)(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, + int filter_length, unsigned char* const* source_data_rows, + int pixel_width, unsigned char* out_row, bool has_alpha); + extern void (*convolve_4_rows_horizontally)(const unsigned char* src_data[4], + const SkConvolutionFilter1D& filter, + unsigned char* out_row[4], size_t out_row_bytes); + extern void (*convolve_horizontally)(const unsigned char* src_data, const SkConvolutionFilter1D& filter, + unsigned char* out_row, bool has_alpha); } #endif//SkOpts_DEFINED diff --git a/src/opts/SkBitmapFilter_opts.h b/src/opts/SkBitmapFilter_opts.h new file mode 100644 index 0000000000..f22b5c2368 --- /dev/null +++ b/src/opts/SkBitmapFilter_opts.h @@ -0,0 +1,940 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkBitmapFilter_opts_DEFINED +#define SkBitmapFilter_opts_DEFINED + +#include "SkConvolver.h" + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + #include <emmintrin.h> +#elif defined(SK_ARM_HAS_NEON) + #include <arm_neon.h> +#endif + +namespace SK_OPTS_NS { + +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + + static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, + const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum, int r) { + int remainder[4] = {0}; + for (int i = 0; i < r; i++) { + SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; + remainder[0] += coeff * pixelsLeft[i * 4 + 0]; + remainder[1] += coeff * pixelsLeft[i * 4 + 1]; + remainder[2] += coeff * pixelsLeft[i * 4 + 2]; + remainder[3] += coeff * pixelsLeft[i * 4 + 3]; + } + __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]); + accum = _mm_add_epi32(accum, t); + } + + // Convolves horizontally along a single row. The row data is given in + // |srcData| and continues for the numValues() of the filter. + void convolve_horizontally(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow, + bool /*hasAlpha*/) { + // Output one pixel each iteration, calculating all channels (RGBA) together. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + __m128i zero = _mm_setzero_si128(); + __m128i accum = _mm_setzero_si128(); + + // We will load and accumulate with four coefficients per iteration. + for (int filterX = 0; filterX < filterLength >> 2; filterX++) { + // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. + __m128i coeff, coeff16; + // [16] xx xx xx xx c3 c2 c1 c0 + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues)); + // [16] xx xx xx xx c1 c1 c0 c0 + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + // [16] c1 c1 c1 c1 c0 c0 c0 c0 + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + + // Load four pixels => unpack the first two pixels to 16 bits => + // multiply with coefficients => accumulate the convolution result. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter)); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0*c0 b0*c0 g0*c0 r0*c0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + // [32] a1*c1 b1*c1 g1*c1 r1*c1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + + // Duplicate 3rd and 4th coefficients for all channels => + // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients + // => accumulate the convolution results. + // [16] xx xx xx xx c3 c3 c2 c2 + coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + // [16] c3 c3 c3 c3 c2 c2 c2 c2 + coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); + // [16] a3 g3 b3 r3 a2 g2 b2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2*c2 b2*c2 g2*c2 r2*c2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + // [32] a3*c3 b3*c3 g3*c3 r3*c3 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum = _mm_add_epi32(accum, t); + + // Advance the pixel and coefficients pointers. + rowToFilter += 16; + filterValues += 4; + } + + // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3 + // coefficients one at a time. + int r = filterLength & 3; + if (r) { + int remainderOffset = (filterOffset + filterLength - r) * 4; + AccumRemainder(srcData + remainderOffset, filterValues, accum, r); + } + + // Shift right for fixed point implementation. + accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); + + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + accum = _mm_packs_epi32(accum, zero); + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + accum = _mm_packus_epi16(accum, zero); + + // Store the pixel value of 32 bits. + *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); + outRow += 4; + } + } + + // Convolves horizontally along four rows. The row data is given in + // |srcData| and continues for the numValues() of the filter. + // The algorithm is almost same as |convolve_horizontally|. Please + // refer to that function for detailed comments. + void convolve_4_rows_horizontally(const unsigned char* srcData[4], + const SkConvolutionFilter1D& filter, + unsigned char* outRow[4], + size_t outRowBytes) { + SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) + + // Output one pixel each iteration, calculating all channels (RGBA) together. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + __m128i zero = _mm_setzero_si128(); + + // four pixels in a column per iteration. + __m128i accum0 = _mm_setzero_si128(); + __m128i accum1 = _mm_setzero_si128(); + __m128i accum2 = _mm_setzero_si128(); + __m128i accum3 = _mm_setzero_si128(); + + int start = filterOffset * 4; + // We will load and accumulate with four coefficients per iteration. + for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { + __m128i coeff, coeff16lo, coeff16hi; + // [16] xx xx xx xx c3 c2 c1 c0 + coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues)); + // [16] xx xx xx xx c1 c1 c0 c0 + coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); + // [16] c1 c1 c1 c1 c0 c0 c0 c0 + coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); + // [16] xx xx xx xx c3 c3 c2 c2 + coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); + // [16] c3 c3 c3 c3 c2 c2 c2 c2 + coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); + + __m128i src8, src16, mul_hi, mul_lo, t; + +#define ITERATION(src, accum) \ + src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ + src16 = _mm_unpacklo_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ + mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + src16 = _mm_unpackhi_epi8(src8, zero); \ + mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ + mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ + t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t); \ + t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ + accum = _mm_add_epi32(accum, t) + + ITERATION(srcData[0] + start, accum0); + ITERATION(srcData[1] + start, accum1); + ITERATION(srcData[2] + start, accum2); + ITERATION(srcData[3] + start, accum3); + + start += 16; + filterValues += 4; + } + + int r = filterLength & 3; + if (r) { + int remainderOffset = (filterOffset + filterLength - r) * 4; + AccumRemainder(srcData[0] + remainderOffset, filterValues, accum0, r); + AccumRemainder(srcData[1] + remainderOffset, filterValues, accum1, r); + AccumRemainder(srcData[2] + remainderOffset, filterValues, accum2, r); + AccumRemainder(srcData[3] + remainderOffset, filterValues, accum3, r); + } + + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum0 = _mm_packs_epi32(accum0, zero); + accum0 = _mm_packus_epi16(accum0, zero); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_packs_epi32(accum1, zero); + accum1 = _mm_packus_epi16(accum1, zero); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_packs_epi32(accum2, zero); + accum2 = _mm_packus_epi16(accum2, zero); + accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); + accum3 = _mm_packs_epi32(accum3, zero); + accum3 = _mm_packus_epi16(accum3, zero); + + // We seem to be running off the edge here (chromium:491660). + SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes); + + *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); + *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); + *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); + *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); + + outRow[0] += 4; + outRow[1] += 4; + outRow[2] += 4; + outRow[3] += 4; + } + } + + // Does vertical convolution to produce one output row. The filter values and + // length are given in the first two parameters. These are applied to each + // of the rows pointed to in the |sourceDataRows| array, with each row + // being |pixelWidth| wide. + // + // The output must have room for |pixelWidth * 4| bytes. + template<bool hasAlpha> + void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow) { + // Output four pixels per iteration (16 bytes). + int width = pixelWidth & ~3; + __m128i zero = _mm_setzero_si128(); + for (int outX = 0; outX < width; outX += 4) { + // Accumulated result for each pixel. 32 bits per RGBA channel. + __m128i accum0 = _mm_setzero_si128(); + __m128i accum1 = _mm_setzero_si128(); + __m128i accum2 = _mm_setzero_si128(); + __m128i accum3 = _mm_setzero_si128(); + + // Convolve with one filter coefficient per iteration. + for (int filterY = 0; filterY < filterLength; filterY++) { + + // Duplicate the filter coefficient 8 times. + // [16] cj cj cj cj cj cj cj cj + __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); + + // Load four pixels (16 bytes) together. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + const __m128i* src = reinterpret_cast<const __m128i*>( + &sourceDataRows[filterY][outX << 2]); + __m128i src8 = _mm_loadu_si128(src); + + // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => + // multiply with current coefficient => accumulate the result. + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0 b0 g0 r0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum0 = _mm_add_epi32(accum0, t); + // [32] a1 b1 g1 r1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum1 = _mm_add_epi32(accum1, t); + + // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => + // multiply with current coefficient => accumulate the result. + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2 b2 g2 r2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum2 = _mm_add_epi32(accum2, t); + // [32] a3 b3 g3 r3 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum3 = _mm_add_epi32(accum3, t); + } + + // Shift right for fixed point implementation. + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); + + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packs_epi32(accum0, accum1); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + accum2 = _mm_packs_epi32(accum2, accum3); + + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packus_epi16(accum0, accum2); + + if (hasAlpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + __m128i a = _mm_srli_epi32(accum0, 8); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = _mm_srli_epi32(accum0, 16); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = _mm_max_epu8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = _mm_slli_epi32(b, 24); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum0 = _mm_max_epu8(b, accum0); + } else { + // Set value of alpha channels to 0xFF. + __m128i mask = _mm_set1_epi32(0xff000000); + accum0 = _mm_or_si128(accum0, mask); + } + + // Store the convolution result (16 bytes) and advance the pixel pointers. + _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); + outRow += 16; + } + + // When the width of the output is not divisible by 4, We need to save one + // pixel (4 bytes) each time. And also the fourth pixel is always absent. + int r = pixelWidth & 3; + if (r) { + __m128i accum0 = _mm_setzero_si128(); + __m128i accum1 = _mm_setzero_si128(); + __m128i accum2 = _mm_setzero_si128(); + for (int filterY = 0; filterY < filterLength; ++filterY) { + __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + const __m128i* src = reinterpret_cast<const __m128i*>( + &sourceDataRows[filterY][width << 2]); + __m128i src8 = _mm_loadu_si128(src); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + __m128i src16 = _mm_unpacklo_epi8(src8, zero); + __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); + __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a0 b0 g0 r0 + __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum0 = _mm_add_epi32(accum0, t); + // [32] a1 b1 g1 r1 + t = _mm_unpackhi_epi16(mul_lo, mul_hi); + accum1 = _mm_add_epi32(accum1, t); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + src16 = _mm_unpackhi_epi8(src8, zero); + mul_hi = _mm_mulhi_epi16(src16, coeff16); + mul_lo = _mm_mullo_epi16(src16, coeff16); + // [32] a2 b2 g2 r2 + t = _mm_unpacklo_epi16(mul_lo, mul_hi); + accum2 = _mm_add_epi32(accum2, t); + } + + accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packs_epi32(accum0, accum1); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + accum2 = _mm_packs_epi32(accum2, zero); + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + accum0 = _mm_packus_epi16(accum0, accum2); + if (hasAlpha) { + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + __m128i a = _mm_srli_epi32(accum0, 8); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = _mm_srli_epi32(accum0, 16); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = _mm_max_epu8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = _mm_slli_epi32(b, 24); + accum0 = _mm_max_epu8(b, accum0); + } else { + __m128i mask = _mm_set1_epi32(0xff000000); + accum0 = _mm_or_si128(accum0, mask); + } + + for (int i = 0; i < r; i++) { + *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); + accum0 = _mm_srli_si128(accum0, 4); + outRow += 4; + } + } + } + +#elif defined(SK_ARM_HAS_NEON) + + static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, + const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) { + int remainder[4] = {0}; + for (int i = 0; i < r; i++) { + SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; + remainder[0] += coeff * pixelsLeft[i * 4 + 0]; + remainder[1] += coeff * pixelsLeft[i * 4 + 1]; + remainder[2] += coeff * pixelsLeft[i * 4 + 2]; + remainder[3] += coeff * pixelsLeft[i * 4 + 3]; + } + int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; + accum += t; + } + + // Convolves horizontally along a single row. The row data is given in + // |srcData| and continues for the numValues() of the filter. + void convolve_horizontally(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow, + bool /*hasAlpha*/) { + // Loop over each pixel on this row in the output image. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); + uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); + uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); + uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + // Apply the filter to the row to get the destination pixel in |accum|. + int32x4_t accum = vdupq_n_s32(0); + for (int filterX = 0; filterX < filterLength >> 2; filterX++) { + // Load 4 coefficients + int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; + coeffs = vld1_s16(filterValues); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); + coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); + + // Load pixels and calc + uint8x16_t pixels = vld1q_u8(rowToFilter); + int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); + int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); + + int16x4_t p0_src = vget_low_s16(p01_16); + int16x4_t p1_src = vget_high_s16(p01_16); + int16x4_t p2_src = vget_low_s16(p23_16); + int16x4_t p3_src = vget_high_s16(p23_16); + + int32x4_t p0 = vmull_s16(p0_src, coeff0); + int32x4_t p1 = vmull_s16(p1_src, coeff1); + int32x4_t p2 = vmull_s16(p2_src, coeff2); + int32x4_t p3 = vmull_s16(p3_src, coeff3); + + accum += p0; + accum += p1; + accum += p2; + accum += p3; + + // Advance the pointers + rowToFilter += 16; + filterValues += 4; + } + + int r = filterLength & 3; + if (r) { + int remainder_offset = (filterOffset + filterLength - r) * 4; + AccumRemainder(srcData + remainder_offset, filterValues, accum, r); + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of fractional part. + accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); + + // Pack and store the new pixel. + int16x4_t accum16 = vqmovn_s32(accum); + uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0); + outRow += 4; + } + } + + // Convolves horizontally along four rows. The row data is given in + // |srcData| and continues for the numValues() of the filter. + // The algorithm is almost same as |convolve_horizontally|. Please + // refer to that function for detailed comments. + void convolve_4_rows_horizontally(const unsigned char* srcData[4], + const SkConvolutionFilter1D& filter, + unsigned char* outRow[4], + size_t outRowBytes) { + // Output one pixel each iteration, calculating all channels (RGBA) together. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // four pixels in a column per iteration. + int32x4_t accum0 = vdupq_n_s32(0); + int32x4_t accum1 = vdupq_n_s32(0); + int32x4_t accum2 = vdupq_n_s32(0); + int32x4_t accum3 = vdupq_n_s32(0); + + uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); + uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); + uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); + uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); + + int start = filterOffset * 4; + + // We will load and accumulate with four coefficients per iteration. + for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { + int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; + + coeffs = vld1_s16(filterValues); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); + coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); + + uint8x16_t pixels; + int16x8_t p01_16, p23_16; + int32x4_t p0, p1, p2, p3; + +#define ITERATION(src, accum) \ + pixels = vld1q_u8(src); \ + p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \ + p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \ + p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \ + p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \ + p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \ + p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \ + accum += p0; \ + accum += p1; \ + accum += p2; \ + accum += p3 + + ITERATION(srcData[0] + start, accum0); + ITERATION(srcData[1] + start, accum1); + ITERATION(srcData[2] + start, accum2); + ITERATION(srcData[3] + start, accum3); + + start += 16; + filterValues += 4; + } + + int r = filterLength & 3; + if (r) { + int remainder_offset = (filterOffset + filterLength - r) * 4; + AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r); + AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r); + AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r); + AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r); + } + + int16x4_t accum16; + uint8x8_t res0, res1, res2, res3; + +#define PACK_RESULT(accum, res) \ + accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ + accum16 = vqmovn_s32(accum); \ + res = vqmovun_s16(vcombine_s16(accum16, accum16)); + + PACK_RESULT(accum0, res0); + PACK_RESULT(accum1, res1); + PACK_RESULT(accum2, res2); + PACK_RESULT(accum3, res3); + + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0); + outRow[0] += 4; + outRow[1] += 4; + outRow[2] += 4; + outRow[3] += 4; + } + } + + + // Does vertical convolution to produce one output row. The filter values and + // length are given in the first two parameters. These are applied to each + // of the rows pointed to in the |sourceDataRows| array, with each row + // being |pixelWidth| wide. + // + // The output must have room for |pixelWidth * 4| bytes. + template<bool hasAlpha> + void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow) { + int width = pixelWidth & ~3; + + // Output four pixels per iteration (16 bytes). + for (int outX = 0; outX < width; outX += 4) { + + // Accumulated result for each pixel. 32 bits per RGBA channel. + int32x4_t accum0 = vdupq_n_s32(0); + int32x4_t accum1 = vdupq_n_s32(0); + int32x4_t accum2 = vdupq_n_s32(0); + int32x4_t accum3 = vdupq_n_s32(0); + + // Convolve with one filter coefficient per iteration. + for (int filterY = 0; filterY < filterLength; filterY++) { + + // Duplicate the filter coefficient 4 times. + // [16] cj cj cj cj + int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); + + // Load four pixels (16 bytes) together. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); + + int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); + int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); + int16x4_t src16_0 = vget_low_s16(src16_01); + int16x4_t src16_1 = vget_high_s16(src16_01); + int16x4_t src16_2 = vget_low_s16(src16_23); + int16x4_t src16_3 = vget_high_s16(src16_23); + + accum0 += vmull_s16(src16_0, coeff16); + accum1 += vmull_s16(src16_1, coeff16); + accum2 += vmull_s16(src16_2, coeff16); + accum3 += vmull_s16(src16_3, coeff16); + } + + // Shift right for fixed point implementation. + accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); + accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); + + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3)); + + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); + + if (hasAlpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = vmaxq_u8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum8 = vmaxq_u8(b, accum8); + } else { + // Set value of alpha channels to 0xFF. + accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); + } + + // Store the convolution result (16 bytes) and advance the pixel pointers. + vst1q_u8(outRow, accum8); + outRow += 16; + } + + // Process the leftovers when the width of the output is not divisible + // by 4, that is at most 3 pixels. + int r = pixelWidth & 3; + if (r) { + + int32x4_t accum0 = vdupq_n_s32(0); + int32x4_t accum1 = vdupq_n_s32(0); + int32x4_t accum2 = vdupq_n_s32(0); + + for (int filterY = 0; filterY < filterLength; ++filterY) { + int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); + + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]); + + int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); + int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); + int16x4_t src16_0 = vget_low_s16(src16_01); + int16x4_t src16_1 = vget_high_s16(src16_01); + int16x4_t src16_2 = vget_low_s16(src16_23); + + accum0 += vmull_s16(src16_0, coeff16); + accum1 += vmull_s16(src16_1, coeff16); + accum2 += vmull_s16(src16_2, coeff16); + } + + accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); + + int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); + int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2)); + + uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); + + if (hasAlpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = vmaxq_u8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum8 = vmaxq_u8(b, accum8); + } else { + // Set value of alpha channels to 0xFF. + accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); + } + + switch(r) { + case 1: + vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0); + break; + case 2: + vst1_u32(reinterpret_cast<uint32_t*>(outRow), + vreinterpret_u32_u8(vget_low_u8(accum8))); + break; + case 3: + vst1_u32(reinterpret_cast<uint32_t*>(outRow), + vreinterpret_u32_u8(vget_low_u8(accum8))); + vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2); + break; + } + } + } + +#else + + // Converts the argument to an 8-bit unsigned value by clamping to the range + // 0-255. + inline unsigned char ClampTo8(int a) { + if (static_cast<unsigned>(a) < 256) { + return a; // Avoid the extra check in the common case. + } + if (a < 0) { + return 0; + } + return 255; + } + + // Convolves horizontally along a single row. The row data is given in + // |srcData| and continues for the numValues() of the filter. + template<bool hasAlpha> + void ConvolveHorizontally(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow) { + // Loop over each pixel on this row in the output image. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + // Apply the filter to the row to get the destination pixel in |accum|. + int accum[4] = {0}; + for (int filterX = 0; filterX < filterLength; filterX++) { + SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; + accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; + accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; + accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; + if (hasAlpha) { + accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; + } + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of fractional part. + accum[0] >>= SkConvolutionFilter1D::kShiftBits; + accum[1] >>= SkConvolutionFilter1D::kShiftBits; + accum[2] >>= SkConvolutionFilter1D::kShiftBits; + if (hasAlpha) { + accum[3] >>= SkConvolutionFilter1D::kShiftBits; + } + + // Store the new pixel. + outRow[outX * 4 + 0] = ClampTo8(accum[0]); + outRow[outX * 4 + 1] = ClampTo8(accum[1]); + outRow[outX * 4 + 2] = ClampTo8(accum[2]); + if (hasAlpha) { + outRow[outX * 4 + 3] = ClampTo8(accum[3]); + } + } + } + + // Does vertical convolution to produce one output row. The filter values and + // length are given in the first two parameters. These are applied to each + // of the rows pointed to in the |sourceDataRows| array, with each row + // being |pixelWidth| wide. + // + // The output must have room for |pixelWidth * 4| bytes. + template<bool hasAlpha> + void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow) { + // We go through each column in the output and do a vertical convolution, + // generating one output pixel each time. + for (int outX = 0; outX < pixelWidth; outX++) { + // Compute the number of bytes over in each row that the current column + // we're convolving starts at. The pixel will cover the next 4 bytes. + int byteOffset = outX * 4; + + // Apply the filter to one column of pixels. + int accum[4] = {0}; + for (int filterY = 0; filterY < filterLength; filterY++) { + SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; + accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; + accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; + accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; + if (hasAlpha) { + accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; + } + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of precision. + accum[0] >>= SkConvolutionFilter1D::kShiftBits; + accum[1] >>= SkConvolutionFilter1D::kShiftBits; + accum[2] >>= SkConvolutionFilter1D::kShiftBits; + if (hasAlpha) { + accum[3] >>= SkConvolutionFilter1D::kShiftBits; + } + + // Store the new pixel. + outRow[byteOffset + 0] = ClampTo8(accum[0]); + outRow[byteOffset + 1] = ClampTo8(accum[1]); + outRow[byteOffset + 2] = ClampTo8(accum[2]); + if (hasAlpha) { + unsigned char alpha = ClampTo8(accum[3]); + + // Make sure the alpha channel doesn't come out smaller than any of the + // color channels. We use premultipled alpha channels, so this should + // never happen, but rounding errors will cause this from time to time. + // These "impossible" colors will cause overflows (and hence random pixel + // values) when the resulting bitmap is drawn to the screen. + // + // We only need to do this when generating the final output row (here). + int maxColorChannel = SkTMax(outRow[byteOffset + 0], + SkTMax(outRow[byteOffset + 1], + outRow[byteOffset + 2])); + if (alpha < maxColorChannel) { + outRow[byteOffset + 3] = maxColorChannel; + } else { + outRow[byteOffset + 3] = alpha; + } + } else { + // No alpha channel, the image is opaque. + outRow[byteOffset + 3] = 0xff; + } + } + } + + // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally + // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles + // suffer here too. + // + // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575 +#if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) + #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline)) +#else + #define SK_MAYBE_DISABLE_VECTORIZATION +#endif + + SK_MAYBE_DISABLE_VECTORIZATION + void convolve_horizontally(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow, + bool hasAlpha) { + if (hasAlpha) { + ConvolveHorizontally<true>(srcData, filter, outRow); + } else { + ConvolveHorizontally<false>(srcData, filter, outRow); + } + } +#undef SK_MAYBE_DISABLE_VECTORIZATION + + void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4], + const SkConvolutionFilter1D& filter, + unsigned char* outRow[4], + size_t outRowBytes) + = nullptr; + + +#endif + + void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow, + bool hasAlpha) { + if (hasAlpha) { + ConvolveVertically<true>(filterValues, filterLength, sourceDataRows, + pixelWidth, outRow); + } else { + ConvolveVertically<false>(filterValues, filterLength, sourceDataRows, + pixelWidth, outRow); + } + } + +} // namespace SK_OPTS_NS + +#endif//SkBitmapFilter_opts_DEFINED diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp deleted file mode 100644 index 324ac1a5c1..0000000000 --- a/src/opts/SkBitmapFilter_opts_SSE2.cpp +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright 2013 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#include <emmintrin.h> -#include "SkBitmap.h" -#include "SkBitmapFilter_opts_SSE2.h" -#include "SkBitmapProcState.h" -#include "SkColor.h" -#include "SkColorPriv.h" -#include "SkConvolver.h" -#include "SkShader.h" -#include "SkUnPreMultiply.h" - -#if 0 -static inline void print128i(__m128i value) { - int *v = (int*) &value; - printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); -} - -static inline void print128i_16(__m128i value) { - short *v = (short*) &value; - printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); -} - -static inline void print128i_8(__m128i value) { - unsigned char *v = (unsigned char*) &value; - printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n", - v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], - v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] - ); -} - -static inline void print128f(__m128 value) { - float *f = (float*) &value; - printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); -} -#endif - -static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, - const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& accum, int r) { - int remainder[4] = {0}; - for (int i = 0; i < r; i++) { - SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; - remainder[0] += coeff * pixels_left[i * 4 + 0]; - remainder[1] += coeff * pixels_left[i * 4 + 1]; - remainder[2] += coeff * pixels_left[i * 4 + 2]; - remainder[3] += coeff * pixels_left[i * 4 + 3]; - } - __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]); - accum = _mm_add_epi32(accum, t); -} - -// Convolves horizontally along a single row. The row data is given in -// |src_data| and continues for the num_values() of the filter. -void convolveHorizontally_SSE2(const unsigned char* src_data, - const SkConvolutionFilter1D& filter, - unsigned char* out_row, - bool /*has_alpha*/) { - int num_values = filter.numValues(); - - int filter_offset, filter_length; - __m128i zero = _mm_setzero_si128(); - - // Output one pixel each iteration, calculating all channels (RGBA) together. - for (int out_x = 0; out_x < num_values; out_x++) { - const SkConvolutionFilter1D::ConvolutionFixed* filter_values = - filter.FilterForValue(out_x, &filter_offset, &filter_length); - - __m128i accum = _mm_setzero_si128(); - - // Compute the first pixel in this row that the filter affects. It will - // touch |filter_length| pixels (4 bytes each) after this. - const __m128i* row_to_filter = - reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); - - // We will load and accumulate with four coefficients per iteration. - for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { - - // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. - __m128i coeff, coeff16; - // [16] xx xx xx xx c3 c2 c1 c0 - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // [16] xx xx xx xx c1 c1 c0 c0 - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - // [16] c1 c1 c1 c1 c0 c0 c0 c0 - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - - // Load four pixels => unpack the first two pixels to 16 bits => - // multiply with coefficients => accumulate the convolution result. - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src8 = _mm_loadu_si128(row_to_filter); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0*c0 b0*c0 g0*c0 r0*c0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - // [32] a1*c1 b1*c1 g1*c1 r1*c1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - - // Duplicate 3rd and 4th coefficients for all channels => - // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients - // => accumulate the convolution results. - // [16] xx xx xx xx c3 c3 c2 c2 - coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - // [16] c3 c3 c3 c3 c2 c2 c2 c2 - coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); - // [16] a3 g3 b3 r3 a2 g2 b2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2*c2 b2*c2 g2*c2 r2*c2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - // [32] a3*c3 b3*c3 g3*c3 r3*c3 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum = _mm_add_epi32(accum, t); - - // Advance the pixel and coefficients pointers. - row_to_filter += 1; - filter_values += 4; - } - - // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3 - // coefficients one at a time. - int r = filter_length & 3; - if (r) { - int remainder_offset = (filter_offset + filter_length - r) * 4; - accum_remainder(src_data + remainder_offset, filter_values, accum, r); - } - - // Shift right for fixed point implementation. - accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); - - // Packing 32 bits |accum| to 16 bits per channel (signed saturation). - accum = _mm_packs_epi32(accum, zero); - // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). - accum = _mm_packus_epi16(accum, zero); - - // Store the pixel value of 32 bits. - *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); - out_row += 4; - } -} - -// Convolves horizontally along four rows. The row data is given in -// |src_data| and continues for the num_values() of the filter. -// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please -// refer to that function for detailed comments. -void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], - const SkConvolutionFilter1D& filter, - unsigned char* out_row[4], - size_t outRowBytes) { - SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) - - int num_values = filter.numValues(); - - int filter_offset, filter_length; - __m128i zero = _mm_setzero_si128(); - - // Output one pixel each iteration, calculating all channels (RGBA) together. - for (int out_x = 0; out_x < num_values; out_x++) { - const SkConvolutionFilter1D::ConvolutionFixed* filter_values = - filter.FilterForValue(out_x, &filter_offset, &filter_length); - - // four pixels in a column per iteration. - __m128i accum0 = _mm_setzero_si128(); - __m128i accum1 = _mm_setzero_si128(); - __m128i accum2 = _mm_setzero_si128(); - __m128i accum3 = _mm_setzero_si128(); - int start = (filter_offset<<2); - // We will load and accumulate with four coefficients per iteration. - for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { - __m128i coeff, coeff16lo, coeff16hi; - // [16] xx xx xx xx c3 c2 c1 c0 - coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); - // [16] xx xx xx xx c1 c1 c0 c0 - coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); - // [16] c1 c1 c1 c1 c0 c0 c0 c0 - coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); - // [16] xx xx xx xx c3 c3 c2 c2 - coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); - // [16] c3 c3 c3 c3 c2 c2 c2 c2 - coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); - - __m128i src8, src16, mul_hi, mul_lo, t; - -#define ITERATION(src, accum) \ - src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ - src16 = _mm_unpacklo_epi8(src8, zero); \ - mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ - mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ - t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - src16 = _mm_unpackhi_epi8(src8, zero); \ - mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ - mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ - t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t); \ - t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ - accum = _mm_add_epi32(accum, t) - - ITERATION(src_data[0] + start, accum0); - ITERATION(src_data[1] + start, accum1); - ITERATION(src_data[2] + start, accum2); - ITERATION(src_data[3] + start, accum3); - - start += 16; - filter_values += 4; - } - - int r = filter_length & 3; - if (r) { - int remainder_offset = (filter_offset + filter_length - r) * 4; - accum_remainder(src_data[0] + remainder_offset, filter_values, accum0, r); - accum_remainder(src_data[1] + remainder_offset, filter_values, accum1, r); - accum_remainder(src_data[2] + remainder_offset, filter_values, accum2, r); - accum_remainder(src_data[3] + remainder_offset, filter_values, accum3, r); - } - - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum0 = _mm_packs_epi32(accum0, zero); - accum0 = _mm_packus_epi16(accum0, zero); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_packs_epi32(accum1, zero); - accum1 = _mm_packus_epi16(accum1, zero); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_packs_epi32(accum2, zero); - accum2 = _mm_packus_epi16(accum2, zero); - accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); - accum3 = _mm_packs_epi32(accum3, zero); - accum3 = _mm_packus_epi16(accum3, zero); - - // We seem to be running off the edge here (chromium:491660). - SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes); - - *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); - *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); - *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); - *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); - - out_row[0] += 4; - out_row[1] += 4; - out_row[2] += 4; - out_row[3] += 4; - } -} - -// Does vertical convolution to produce one output row. The filter values and -// length are given in the first two parameters. These are applied to each -// of the rows pointed to in the |source_data_rows| array, with each row -// being |pixel_width| wide. -// -// The output must have room for |pixel_width * 4| bytes. -template<bool has_alpha> -void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, - int filter_length, - unsigned char* const* source_data_rows, - int pixel_width, - unsigned char* out_row) { - int width = pixel_width & ~3; - - __m128i zero = _mm_setzero_si128(); - __m128i accum0, accum1, accum2, accum3, coeff16; - const __m128i* src; - // Output four pixels per iteration (16 bytes). - for (int out_x = 0; out_x < width; out_x += 4) { - - // Accumulated result for each pixel. 32 bits per RGBA channel. - accum0 = _mm_setzero_si128(); - accum1 = _mm_setzero_si128(); - accum2 = _mm_setzero_si128(); - accum3 = _mm_setzero_si128(); - - // Convolve with one filter coefficient per iteration. - for (int filter_y = 0; filter_y < filter_length; filter_y++) { - - // Duplicate the filter coefficient 8 times. - // [16] cj cj cj cj cj cj cj cj - coeff16 = _mm_set1_epi16(filter_values[filter_y]); - - // Load four pixels (16 bytes) together. - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - src = reinterpret_cast<const __m128i*>( - &source_data_rows[filter_y][out_x << 2]); - __m128i src8 = _mm_loadu_si128(src); - - // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => - // multiply with current coefficient => accumulate the result. - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0 b0 g0 r0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum0 = _mm_add_epi32(accum0, t); - // [32] a1 b1 g1 r1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum1 = _mm_add_epi32(accum1, t); - - // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => - // multiply with current coefficient => accumulate the result. - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2 b2 g2 r2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum2 = _mm_add_epi32(accum2, t); - // [32] a3 b3 g3 r3 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum3 = _mm_add_epi32(accum3, t); - } - - // Shift right for fixed point implementation. - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); - - // Packing 32 bits |accum| to 16 bits per channel (signed saturation). - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packs_epi32(accum0, accum1); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - accum2 = _mm_packs_epi32(accum2, accum3); - - // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packus_epi16(accum0, accum2); - - if (has_alpha) { - // Compute the max(ri, gi, bi) for each pixel. - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - __m128i a = _mm_srli_epi32(accum0, 8); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = _mm_srli_epi32(accum0, 16); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = _mm_max_epu8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = _mm_slli_epi32(b, 24); - - // Make sure the value of alpha channel is always larger than maximum - // value of color channels. - accum0 = _mm_max_epu8(b, accum0); - } else { - // Set value of alpha channels to 0xFF. - __m128i mask = _mm_set1_epi32(0xff000000); - accum0 = _mm_or_si128(accum0, mask); - } - - // Store the convolution result (16 bytes) and advance the pixel pointers. - _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); - out_row += 16; - } - - // When the width of the output is not divisible by 4, We need to save one - // pixel (4 bytes) each time. And also the fourth pixel is always absent. - if (pixel_width & 3) { - accum0 = _mm_setzero_si128(); - accum1 = _mm_setzero_si128(); - accum2 = _mm_setzero_si128(); - for (int filter_y = 0; filter_y < filter_length; ++filter_y) { - coeff16 = _mm_set1_epi16(filter_values[filter_y]); - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - src = reinterpret_cast<const __m128i*>( - &source_data_rows[filter_y][width<<2]); - __m128i src8 = _mm_loadu_si128(src); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - __m128i src16 = _mm_unpacklo_epi8(src8, zero); - __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); - __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a0 b0 g0 r0 - __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum0 = _mm_add_epi32(accum0, t); - // [32] a1 b1 g1 r1 - t = _mm_unpackhi_epi16(mul_lo, mul_hi); - accum1 = _mm_add_epi32(accum1, t); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - src16 = _mm_unpackhi_epi8(src8, zero); - mul_hi = _mm_mulhi_epi16(src16, coeff16); - mul_lo = _mm_mullo_epi16(src16, coeff16); - // [32] a2 b2 g2 r2 - t = _mm_unpacklo_epi16(mul_lo, mul_hi); - accum2 = _mm_add_epi32(accum2, t); - } - - accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packs_epi32(accum0, accum1); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - accum2 = _mm_packs_epi32(accum2, zero); - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - accum0 = _mm_packus_epi16(accum0, accum2); - if (has_alpha) { - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - __m128i a = _mm_srli_epi32(accum0, 8); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = _mm_srli_epi32(accum0, 16); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = _mm_max_epu8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = _mm_slli_epi32(b, 24); - accum0 = _mm_max_epu8(b, accum0); - } else { - __m128i mask = _mm_set1_epi32(0xff000000); - accum0 = _mm_or_si128(accum0, mask); - } - - for (int out_x = width; out_x < pixel_width; out_x++) { - *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); - accum0 = _mm_srli_si128(accum0, 4); - out_row += 4; - } - } -} - -void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, - int filter_length, - unsigned char* const* source_data_rows, - int pixel_width, - unsigned char* out_row, - bool has_alpha) { - if (has_alpha) { - convolveVertically_SSE2<true>(filter_values, - filter_length, - source_data_rows, - pixel_width, - out_row); - } else { - convolveVertically_SSE2<false>(filter_values, - filter_length, - source_data_rows, - pixel_width, - out_row); - } -} diff --git a/src/opts/SkBitmapFilter_opts_SSE2.h b/src/opts/SkBitmapFilter_opts_SSE2.h deleted file mode 100644 index 46ab5c8ae7..0000000000 --- a/src/opts/SkBitmapFilter_opts_SSE2.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2013 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#ifndef SkBitmapFilter_opts_sse2_DEFINED -#define SkBitmapFilter_opts_sse2_DEFINED - -#include "SkBitmapProcState.h" -#include "SkConvolver.h" - -void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, - int filter_length, - unsigned char* const* source_data_rows, - int pixel_width, - unsigned char* out_row, - bool has_alpha); -void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], - const SkConvolutionFilter1D& filter, - unsigned char* out_row[4], - size_t outRowBytes); -void convolveHorizontally_SSE2(const unsigned char* src_data, - const SkConvolutionFilter1D& filter, - unsigned char* out_row, - bool has_alpha); -void applySIMDPadding_SSE2(SkConvolutionFilter1D* filter); - -#endif diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp index 4193e6af07..2f442514dd 100644 --- a/src/opts/SkBitmapProcState_arm_neon.cpp +++ b/src/opts/SkBitmapProcState_arm_neon.cpp @@ -77,382 +77,3 @@ const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = { SG8_alpha_D32_filter_DX_neon, SG8_alpha_D32_filter_DX_neon, }; - -/////////////////////////////////////////////////////////////////////////////// - -#include <arm_neon.h> -#include "SkConvolver.h" - -static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, - const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) { - int remainder[4] = {0}; - for (int i = 0; i < r; i++) { - SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; - remainder[0] += coeff * pixels_left[i * 4 + 0]; - remainder[1] += coeff * pixels_left[i * 4 + 1]; - remainder[2] += coeff * pixels_left[i * 4 + 2]; - remainder[3] += coeff * pixels_left[i * 4 + 3]; - } - int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; - accum += t; -} - -// Convolves horizontally along a single row. The row data is given in -// |srcData| and continues for the numValues() of the filter. -void convolveHorizontally_neon(const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow, - bool hasAlpha) { - // Loop over each pixel on this row in the output image. - int numValues = filter.numValues(); - for (int outX = 0; outX < numValues; outX++) { - uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); - uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); - uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); - uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); - // Get the filter that determines the current output pixel. - int filterOffset, filterLength; - const SkConvolutionFilter1D::ConvolutionFixed* filterValues = - filter.FilterForValue(outX, &filterOffset, &filterLength); - - // Compute the first pixel in this row that the filter affects. It will - // touch |filterLength| pixels (4 bytes each) after this. - const unsigned char* rowToFilter = &srcData[filterOffset * 4]; - - // Apply the filter to the row to get the destination pixel in |accum|. - int32x4_t accum = vdupq_n_s32(0); - for (int filterX = 0; filterX < filterLength >> 2; filterX++) { - // Load 4 coefficients - int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; - coeffs = vld1_s16(filterValues); - coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); - coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); - coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); - coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); - - // Load pixels and calc - uint8x16_t pixels = vld1q_u8(rowToFilter); - int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); - int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); - - int16x4_t p0_src = vget_low_s16(p01_16); - int16x4_t p1_src = vget_high_s16(p01_16); - int16x4_t p2_src = vget_low_s16(p23_16); - int16x4_t p3_src = vget_high_s16(p23_16); - - int32x4_t p0 = vmull_s16(p0_src, coeff0); - int32x4_t p1 = vmull_s16(p1_src, coeff1); - int32x4_t p2 = vmull_s16(p2_src, coeff2); - int32x4_t p3 = vmull_s16(p3_src, coeff3); - - accum += p0; - accum += p1; - accum += p2; - accum += p3; - - // Advance the pointers - rowToFilter += 16; - filterValues += 4; - } - - int r = filterLength & 3; - if (r) { - int remainder_offset = (filterOffset + filterLength - r) * 4; - accum_remainder(srcData + remainder_offset, filterValues, accum, r); - } - - // Bring this value back in range. All of the filter scaling factors - // are in fixed point with kShiftBits bits of fractional part. - accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); - - // Pack and store the new pixel. - int16x4_t accum16 = vqmovn_s32(accum); - uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); - vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0); - outRow += 4; - } -} - -// Does vertical convolution to produce one output row. The filter values and -// length are given in the first two parameters. These are applied to each -// of the rows pointed to in the |sourceDataRows| array, with each row -// being |pixelWidth| wide. -// -// The output must have room for |pixelWidth * 4| bytes. -template<bool hasAlpha> -void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow) { - int width = pixelWidth & ~3; - - int32x4_t accum0, accum1, accum2, accum3; - int16x4_t coeff16; - - // Output four pixels per iteration (16 bytes). - for (int outX = 0; outX < width; outX += 4) { - - // Accumulated result for each pixel. 32 bits per RGBA channel. - accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0); - - // Convolve with one filter coefficient per iteration. - for (int filterY = 0; filterY < filterLength; filterY++) { - - // Duplicate the filter coefficient 4 times. - // [16] cj cj cj cj - coeff16 = vdup_n_s16(filterValues[filterY]); - - // Load four pixels (16 bytes) together. - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); - - int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); - int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); - int16x4_t src16_0 = vget_low_s16(src16_01); - int16x4_t src16_1 = vget_high_s16(src16_01); - int16x4_t src16_2 = vget_low_s16(src16_23); - int16x4_t src16_3 = vget_high_s16(src16_23); - - accum0 += vmull_s16(src16_0, coeff16); - accum1 += vmull_s16(src16_1, coeff16); - accum2 += vmull_s16(src16_2, coeff16); - accum3 += vmull_s16(src16_3, coeff16); - } - - // Shift right for fixed point implementation. - accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); - accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); - - // Packing 32 bits |accum| to 16 bits per channel (signed saturation). - // [16] a1 b1 g1 r1 a0 b0 g0 r0 - int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); - // [16] a3 b3 g3 r3 a2 b2 g2 r2 - int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3)); - - // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); - - if (hasAlpha) { - // Compute the max(ri, gi, bi) for each pixel. - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = vmaxq_u8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); - - // Make sure the value of alpha channel is always larger than maximum - // value of color channels. - accum8 = vmaxq_u8(b, accum8); - } else { - // Set value of alpha channels to 0xFF. - accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); - } - - // Store the convolution result (16 bytes) and advance the pixel pointers. - vst1q_u8(outRow, accum8); - outRow += 16; - } - - // Process the leftovers when the width of the output is not divisible - // by 4, that is at most 3 pixels. - int r = pixelWidth & 3; - if (r) { - - accum0 = accum1 = accum2 = vdupq_n_s32(0); - - for (int filterY = 0; filterY < filterLength; ++filterY) { - coeff16 = vdup_n_s16(filterValues[filterY]); - - // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 - uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]); - - int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); - int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); - int16x4_t src16_0 = vget_low_s16(src16_01); - int16x4_t src16_1 = vget_high_s16(src16_01); - int16x4_t src16_2 = vget_low_s16(src16_23); - - accum0 += vmull_s16(src16_0, coeff16); - accum1 += vmull_s16(src16_1, coeff16); - accum2 += vmull_s16(src16_2, coeff16); - } - - accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); - accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); - accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); - - int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); - int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2)); - - uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); - - if (hasAlpha) { - // Compute the max(ri, gi, bi) for each pixel. - // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 - uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g - // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 - a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); - // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 - b = vmaxq_u8(a, b); // Max of r and g and b. - // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 - b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); - - // Make sure the value of alpha channel is always larger than maximum - // value of color channels. - accum8 = vmaxq_u8(b, accum8); - } else { - // Set value of alpha channels to 0xFF. - accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); - } - - switch(r) { - case 1: - vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0); - break; - case 2: - vst1_u32(reinterpret_cast<uint32_t*>(outRow), - vreinterpret_u32_u8(vget_low_u8(accum8))); - break; - case 3: - vst1_u32(reinterpret_cast<uint32_t*>(outRow), - vreinterpret_u32_u8(vget_low_u8(accum8))); - vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2); - break; - } - } -} - -void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow, - bool sourceHasAlpha) { - if (sourceHasAlpha) { - convolveVertically_neon<true>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } else { - convolveVertically_neon<false>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } -} - -// Convolves horizontally along four rows. The row data is given in -// |src_data| and continues for the num_values() of the filter. -// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please -// refer to that function for detailed comments. -void convolve4RowsHorizontally_neon(const unsigned char* srcData[4], - const SkConvolutionFilter1D& filter, - unsigned char* outRow[4], - size_t outRowBytes) { - - uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); - uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); - uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); - uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); - int num_values = filter.numValues(); - - int filterOffset, filterLength; - - // Output one pixel each iteration, calculating all channels (RGBA) together. - for (int outX = 0; outX < num_values; outX++) { - - const SkConvolutionFilter1D::ConvolutionFixed* filterValues = - filter.FilterForValue(outX, &filterOffset, &filterLength); - - // four pixels in a column per iteration. - int32x4_t accum0 = vdupq_n_s32(0); - int32x4_t accum1 = vdupq_n_s32(0); - int32x4_t accum2 = vdupq_n_s32(0); - int32x4_t accum3 = vdupq_n_s32(0); - - int start = (filterOffset<<2); - - // We will load and accumulate with four coefficients per iteration. - for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) { - int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; - - coeffs = vld1_s16(filterValues); - coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); - coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); - coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); - coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); - - uint8x16_t pixels; - int16x8_t p01_16, p23_16; - int32x4_t p0, p1, p2, p3; - - -#define ITERATION(src, accum) \ - pixels = vld1q_u8(src); \ - p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \ - p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \ - p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \ - p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \ - p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \ - p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \ - accum += p0; \ - accum += p1; \ - accum += p2; \ - accum += p3 - - ITERATION(srcData[0] + start, accum0); - ITERATION(srcData[1] + start, accum1); - ITERATION(srcData[2] + start, accum2); - ITERATION(srcData[3] + start, accum3); - - start += 16; - filterValues += 4; - } - - int r = filterLength & 3; - if (r) { - int remainder_offset = (filterOffset + filterLength - r) * 4; - accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r); - accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r); - accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r); - accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r); - } - - int16x4_t accum16; - uint8x8_t res0, res1, res2, res3; - -#define PACK_RESULT(accum, res) \ - accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ - accum16 = vqmovn_s32(accum); \ - res = vqmovun_s16(vcombine_s16(accum16, accum16)); - - PACK_RESULT(accum0, res0); - PACK_RESULT(accum1, res1); - PACK_RESULT(accum2, res2); - PACK_RESULT(accum3, res3); - - vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0); - vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0); - vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0); - vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0); - outRow[0] += 4; - outRow[1] += 4; - outRow[2] += 4; - outRow[3] += 4; - } -} - -void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { - procs->fConvolveVertically = &convolveVertically_neon; - procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; - procs->fConvolveHorizontally = &convolveHorizontally_neon; -} diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp deleted file mode 100644 index e3726e7274..0000000000 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2009 The Android Open Source Project - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - - -#include "SkBitmapScaler.h" -#include "SkBitmapProcState.h" -#include "SkColorPriv.h" -#include "SkPaint.h" -#include "SkTypes.h" -#include "SkUtils.h" -#include "SkUtilsArm.h" - -#include "SkConvolver.h" - -void SkBitmapProcState::platformProcs() { } - -/////////////////////////////////////////////////////////////////////////////// - -extern void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs); - -void platformConvolutionProcs_arm(SkConvolutionProcs* procs) { -} - -void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) { - SK_ARM_NEON_WRAP(platformConvolutionProcs_arm)(procs); -} diff --git a/src/opts/SkBitmapProcState_opts_mips_dsp.cpp b/src/opts/SkBitmapProcState_opts_mips_dsp.cpp index 10f80c2a41..449fa61007 100644 --- a/src/opts/SkBitmapProcState_opts_mips_dsp.cpp +++ b/src/opts/SkBitmapProcState_opts_mips_dsp.cpp @@ -257,5 +257,3 @@ void SkBitmapProcState::platformProcs() { break; } } - -void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs*) {} diff --git a/src/opts/SkBitmapProcState_opts_none.cpp b/src/opts/SkBitmapProcState_opts_none.cpp index f2217f3503..0d96e17133 100644 --- a/src/opts/SkBitmapProcState_opts_none.cpp +++ b/src/opts/SkBitmapProcState_opts_none.cpp @@ -22,6 +22,3 @@ // empty implementation just uses default supplied function pointers void SkBitmapProcState::platformProcs() {} - -// empty implementation just uses default supplied function pointers -void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs*) {} diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 64cf0da2b7..4b8c8a118b 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -5,7 +5,6 @@ * found in the LICENSE file. */ -#include "SkBitmapFilter_opts_SSE2.h" #include "SkBitmapProcState_opts_SSE2.h" #include "SkBitmapProcState_opts_SSSE3.h" #include "SkBitmapScaler.h" @@ -35,16 +34,6 @@ //////////////////////////////////////////////////////////////////////////////// -void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) { - if (SkCpu::Supports(SkCpu::SSE2)) { - procs->fConvolveVertically = &convolveVertically_SSE2; - procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; - procs->fConvolveHorizontally = &convolveHorizontally_SSE2; - } -} - -//////////////////////////////////////////////////////////////////////////////// - void SkBitmapProcState::platformProcs() { /* Every optimization in the function requires at least SSE2 */ if (!SkCpu::Supports(SkCpu::SSE2)) { |