author     epoger@google.com <epoger@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>   2013-11-06 15:49:04 +0000
committer  epoger@google.com <epoger@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>   2013-11-06 15:49:04 +0000
commit     23ed4b307fbe7ebed1a5947d1d9be6cdec83e54b (patch)
tree       bdacd768328d27a29942e4808ed433531d7b8a3d /src/opts
parent     1f0121af495e5a70ecff2521729b7749c81a20b2 (diff)
Revert r12154
BUG=skia:1807
git-svn-id: http://skia.googlecode.com/svn/trunk@12156 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts')
-rw-r--r--  src/opts/SkBitmapProcState_arm_neon.cpp   420
-rw-r--r--  src/opts/SkBitmapProcState_opts_arm.cpp   384
2 files changed, 286 insertions, 518 deletions
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
index c1ab9ed8cb..d50707dce0 100644
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ b/src/opts/SkBitmapProcState_arm_neon.cpp
@@ -90,423 +90,3 @@ const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = {
     // Don't support A8 -> 565
     NULL, NULL, NULL, NULL
 };
-
-///////////////////////////////////////////////////////////////////////////////
-
-#include <arm_neon.h>
-#include "SkConvolver.h"
-
-// Convolves horizontally along a single row. The row data is given in
-// |srcData| and continues for the numValues() of the filter.
-void convolveHorizontally_neon(const unsigned char* srcData,
-                               const SkConvolutionFilter1D& filter,
-                               unsigned char* outRow,
-                               bool hasAlpha) {
-    // Loop over each pixel on this row in the output image.
-    int numValues = filter.numValues();
-    for (int outX = 0; outX < numValues; outX++) {
-        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
-        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
-        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
-        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
-        // Get the filter that determines the current output pixel.
-        int filterOffset, filterLength;
-        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
-            filter.FilterForValue(outX, &filterOffset, &filterLength);
-
-        // Compute the first pixel in this row that the filter affects. It will
-        // touch |filterLength| pixels (4 bytes each) after this.
-        const unsigned char* rowToFilter = &srcData[filterOffset * 4];
-
-        // Apply the filter to the row to get the destination pixel in |accum|.
-        int32x4_t accum = vdupq_n_s32(0);
-        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
-            // Load 4 coefficients
-            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
-            coeffs = vld1_s16(filterValues);
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
-            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
-
-            // Load pixels and calc
-            uint8x16_t pixels = vld1q_u8(rowToFilter);
-            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
-            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
-
-            int16x4_t p0_src = vget_low_s16(p01_16);
-            int16x4_t p1_src = vget_high_s16(p01_16);
-            int16x4_t p2_src = vget_low_s16(p23_16);
-            int16x4_t p3_src = vget_high_s16(p23_16);
-
-            int32x4_t p0 = vmull_s16(p0_src, coeff0);
-            int32x4_t p1 = vmull_s16(p1_src, coeff1);
-            int32x4_t p2 = vmull_s16(p2_src, coeff2);
-            int32x4_t p3 = vmull_s16(p3_src, coeff3);
-
-            accum += p0;
-            accum += p1;
-            accum += p2;
-            accum += p3;
-
-            // Advance the pointers
-            rowToFilter += 16;
-            filterValues += 4;
-        }
-        int r = filterLength & 3;
-        if (r) {
-            const uint16_t mask[4][4] = {
-                {0, 0, 0, 0},
-                {0xFFFF, 0, 0, 0},
-                {0xFFFF, 0xFFFF, 0, 0},
-                {0xFFFF, 0xFFFF, 0xFFFF, 0}
-            };
-            uint16x4_t coeffs;
-            int16x4_t coeff0, coeff1, coeff2, coeff3;
-            coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
-            coeffs &= vld1_u16(&mask[r][0]);
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));
-
-            // Load pixels and calc
-            uint8x16_t pixels = vld1q_u8(rowToFilter);
-            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
-            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
-            int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
-            int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
-            int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
-
-            accum += p0;
-            accum += p1;
-            accum += p2;
-        }
-
-        // Bring this value back in range. All of the filter scaling factors
-        // are in fixed point with kShiftBits bits of fractional part.
-        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
-
-        // Pack and store the new pixel.
-        int16x4_t accum16 = vqmovn_s32(accum);
-        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
-        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
-        outRow += 4;
-    }
-}
-
-// Does vertical convolution to produce one output row. The filter values and
-// length are given in the first two parameters. These are applied to each
-// of the rows pointed to in the |sourceDataRows| array, with each row
-// being |pixelWidth| wide.
-//
-// The output must have room for |pixelWidth * 4| bytes.
-template<bool hasAlpha>
-void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
-                             int filterLength,
-                             unsigned char* const* sourceDataRows,
-                             int pixelWidth,
-                             unsigned char* outRow) {
-    int width = pixelWidth & ~3;
-
-    int32x4_t accum0, accum1, accum2, accum3;
-    int16x4_t coeff16;
-
-    // Output four pixels per iteration (16 bytes).
-    for (int outX = 0; outX < width; outX += 4) {
-
-        // Accumulated result for each pixel. 32 bits per RGBA channel.
-        accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);
-
-        // Convolve with one filter coefficient per iteration.
-        for (int filterY = 0; filterY < filterLength; filterY++) {
-
-            // Duplicate the filter coefficient 4 times.
-            // [16] cj cj cj cj
-            coeff16 = vdup_n_s16(filterValues[filterY]);
-
-            // Load four pixels (16 bytes) together.
-            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
-
-            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
-            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
-            int16x4_t src16_0 = vget_low_s16(src16_01);
-            int16x4_t src16_1 = vget_high_s16(src16_01);
-            int16x4_t src16_2 = vget_low_s16(src16_23);
-            int16x4_t src16_3 = vget_high_s16(src16_23);
-
-            accum0 += vmull_s16(src16_0, coeff16);
-            accum1 += vmull_s16(src16_1, coeff16);
-            accum2 += vmull_s16(src16_2, coeff16);
-            accum3 += vmull_s16(src16_3, coeff16);
-        }
-
-        // Shift right for fixed point implementation.
-        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
-        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
-        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
-        accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);
-
-        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
-        // [16] a1 b1 g1 r1 a0 b0 g0 r0
-        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
-        // [16] a3 b3 g3 r3 a2 b2 g2 r2
-        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));
-
-        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
-        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
-
-        if (hasAlpha) {
-            // Compute the max(ri, gi, bi) for each pixel.
-            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
-            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
-            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-            uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
-            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
-            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
-            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-            b = vmaxq_u8(a, b); // Max of r and g and b.
-            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
-            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
-
-            // Make sure the value of alpha channel is always larger than maximum
-            // value of color channels.
-            accum8 = vmaxq_u8(b, accum8);
-        } else {
-            // Set value of alpha channels to 0xFF.
-            accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
-        }
-
-        // Store the convolution result (16 bytes) and advance the pixel pointers.
-        vst1q_u8(outRow, accum8);
-        outRow += 16;
-    }
-
-    // Process the leftovers when the width of the output is not divisible
-    // by 4, that is at most 3 pixels.
-    int r = pixelWidth & 3;
-    if (r) {
-
-        accum0 = accum1 = accum2 = vdupq_n_s32(0);
-
-        for (int filterY = 0; filterY < filterLength; ++filterY) {
-            coeff16 = vdup_n_s16(filterValues[filterY]);
-
-            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
-            uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
-
-            int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
-            int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
-            int16x4_t src16_0 = vget_low_s16(src16_01);
-            int16x4_t src16_1 = vget_high_s16(src16_01);
-            int16x4_t src16_2 = vget_low_s16(src16_23);
-
-            accum0 += vmull_s16(src16_0, coeff16);
-            accum1 += vmull_s16(src16_1, coeff16);
-            accum2 += vmull_s16(src16_2, coeff16);
-        }
-
-        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
-        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
-        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
-
-        int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
-        int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));
-
-        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
-
-        if (hasAlpha) {
-            // Compute the max(ri, gi, bi) for each pixel.
-            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
-            uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
-            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-            uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
-            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
-            a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
-            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
-            b = vmaxq_u8(a, b); // Max of r and g and b.
-            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
-            b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
-
-            // Make sure the value of alpha channel is always larger than maximum
-            // value of color channels.
-            accum8 = vmaxq_u8(b, accum8);
-        } else {
-            // Set value of alpha channels to 0xFF.
-            accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
-        }
-
-        switch(r) {
-        case 1:
-            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);
-            break;
-        case 2:
-            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
-                     vreinterpret_u32_u8(vget_low_u8(accum8)));
-            break;
-        case 3:
-            vst1_u32(reinterpret_cast<uint32_t*>(outRow),
-                     vreinterpret_u32_u8(vget_low_u8(accum8)));
-            vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);
-            break;
-        }
-    }
-}
-
-void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
-                             int filterLength,
-                             unsigned char* const* sourceDataRows,
-                             int pixelWidth,
-                             unsigned char* outRow,
-                             bool sourceHasAlpha) {
-    if (sourceHasAlpha) {
-        convolveVertically_neon<true>(filterValues, filterLength,
-                                      sourceDataRows, pixelWidth,
-                                      outRow);
-    } else {
-        convolveVertically_neon<false>(filterValues, filterLength,
-                                       sourceDataRows, pixelWidth,
-                                       outRow);
-    }
-}
-
-// Convolves horizontally along four rows. The row data is given in
-// |src_data| and continues for the num_values() of the filter.
-// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
-// refer to that function for detailed comments.
-void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
-                                    const SkConvolutionFilter1D& filter,
-                                    unsigned char* outRow[4]) {
-
-    uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
-    uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
-    uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
-    uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
-    int num_values = filter.numValues();
-
-    int filterOffset, filterLength;
-    // |mask| will be used to decimate all extra filter coefficients that are
-    // loaded by SIMD when |filter_length| is not divisible by 4.
-    // mask[0] is not used in following algorithm.
-    const uint16_t mask[4][4] = {
-        {0, 0, 0, 0},
-        {0xFFFF, 0, 0, 0},
-        {0xFFFF, 0xFFFF, 0, 0},
-        {0xFFFF, 0xFFFF, 0xFFFF, 0}
-    };
-
-    // Output one pixel each iteration, calculating all channels (RGBA) together.
-    for (int outX = 0; outX < num_values; outX++) {
-
-        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
-            filter.FilterForValue(outX, &filterOffset, &filterLength);
-
-        // four pixels in a column per iteration.
-        int32x4_t accum0 = vdupq_n_s32(0);
-        int32x4_t accum1 = vdupq_n_s32(0);
-        int32x4_t accum2 = vdupq_n_s32(0);
-        int32x4_t accum3 = vdupq_n_s32(0);
-
-        int start = (filterOffset<<2);
-
-        // We will load and accumulate with four coefficients per iteration.
-        for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
-            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
-
-            coeffs = vld1_s16(filterValues);
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
-            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
-
-            uint8x16_t pixels;
-            int16x8_t p01_16, p23_16;
-            int32x4_t p0, p1, p2, p3;
-
-
-#define ITERATION(src, accum)                                       \
-    pixels = vld1q_u8(src);                                         \
-    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
-    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
-    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
-    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
-    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
-    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
-    accum += p0;                                                    \
-    accum += p1;                                                    \
-    accum += p2;                                                    \
-    accum += p3
-
-            ITERATION(srcData[0] + start, accum0);
-            ITERATION(srcData[1] + start, accum1);
-            ITERATION(srcData[2] + start, accum2);
-            ITERATION(srcData[3] + start, accum3);
-
-            start += 16;
-            filterValues += 4;
-        }
-
-        int r = filterLength & 3;
-        if (r) {
-            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
-            coeffs = vld1_s16(filterValues);
-            coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
-            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
-
-            uint8x16_t pixels;
-            int16x8_t p01_16, p23_16;
-            int32x4_t p0, p1, p2, p3;
-
-            ITERATION(srcData[0] + start, accum0);
-            ITERATION(srcData[1] + start, accum1);
-            ITERATION(srcData[2] + start, accum2);
-            ITERATION(srcData[3] + start, accum3);
-        }
-
-        int16x4_t accum16;
-        uint8x8_t res0, res1, res2, res3;
-
-#define PACK_RESULT(accum, res)                                    \
-    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \
-    accum16 = vqmovn_s32(accum);                                   \
-    res = vqmovun_s16(vcombine_s16(accum16, accum16));
-
-        PACK_RESULT(accum0, res0);
-        PACK_RESULT(accum1, res1);
-        PACK_RESULT(accum2, res2);
-        PACK_RESULT(accum3, res3);
-
-        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
-        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
-        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
-        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
-        outRow[0] += 4;
-        outRow[1] += 4;
-        outRow[2] += 4;
-        outRow[3] += 4;
-    }
-}
-
-void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
-    // Padding |paddingCount| of more dummy coefficients after the coefficients
-    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
-    // together to access invalid memory areas. We are not trying to align the
-    // coefficients right now due to the opaqueness of <vector> implementation.
-    // This has to be done after all |AddFilter| calls.
-    for (int i = 0; i < 8; ++i) {
-        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
-    }
-}
-
-void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
-    procs->fExtraHorizontalReads = 3;
-    procs->fConvolveVertically = &convolveVertically_neon;
-    procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
-    procs->fConvolveHorizontally = &convolveHorizontally_neon;
-    procs->fApplySIMDPadding = &applySIMDPadding_neon;
-}
-
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 96fbebd4e1..badb0f4d3b 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -50,54 +50,54 @@ void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s,
     const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
 
     asm volatile (
-        "cmp %[count8], #0 \n\t" // compare loop counter with 0
-        "beq 2f \n\t" // if loop counter == 0, exit
-        "1: \n\t"
-        "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
-        "subs %[count8], %[count8], #1 \n\t" // decrement loop counter
-        "uxth r4, r5 \n\t" // extract ptr 0
-        "mov r5, r5, lsr #16 \n\t" // extract ptr 1
-        "uxth r6, r7 \n\t" // extract ptr 2
-        "mov r7, r7, lsr #16 \n\t" // extract ptr 3
-        "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
-        "uxth r8, r9 \n\t" // extract ptr 4
-        "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
-        "mov r9, r9, lsr #16 \n\t" // extract ptr 5
-        "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
-        "uxth r10, r11 \n\t" // extract ptr 6
-        "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
-        "mov r11, r11, lsr #16 \n\t" // extract ptr 7
-        "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
-        "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup
-        "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
-        "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup
-        "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
-        "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup
-        "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
-        "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup
-        "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap
-        "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup
-        "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap
-        "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup
-        "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap
-        "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup
-        "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap
-        "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup
-        "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap
-        "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap
-        "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap
-        "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap
-        "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1
-        "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3
-        "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5
-        "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7
-        "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels
-        "bgt 1b \n\t" // loop if counter > 0
-        "2: \n\t"
-        : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors)
-        : [table] "r" (table), [srcAddr] "r" (srcAddr)
-        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
-        );
+        "cmp %[count8], #0 \n\t" // compare loop counter with 0
+        "beq 2f \n\t" // if loop counter == 0, exit
+        "1: \n\t"
+        "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
+        "subs %[count8], %[count8], #1 \n\t" // decrement loop counter
+        "uxth r4, r5 \n\t" // extract ptr 0
+        "mov r5, r5, lsr #16 \n\t" // extract ptr 1
+        "uxth r6, r7 \n\t" // extract ptr 2
+        "mov r7, r7, lsr #16 \n\t" // extract ptr 3
+        "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
+        "uxth r8, r9 \n\t" // extract ptr 4
+        "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
+        "mov r9, r9, lsr #16 \n\t" // extract ptr 5
+        "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
+        "uxth r10, r11 \n\t" // extract ptr 6
+        "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
+        "mov r11, r11, lsr #16 \n\t" // extract ptr 7
+        "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
+        "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup
+        "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
+        "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup
+        "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
+        "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup
+        "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
+        "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup
+        "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap
+        "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup
+        "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap
+        "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup
+        "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap
+        "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup
+        "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap
+        "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup
+        "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap
+        "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap
+        "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap
+        "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap
+        "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1
+        "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3
+        "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5
+        "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7
+        "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels
+        "bgt 1b \n\t" // loop if counter > 0
+        "2: \n\t"
+        : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors)
+        : [table] "r" (table), [srcAddr] "r" (srcAddr)
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+        );
     for (i = (count & 7); i > 0; --i) {
         src = srcAddr[*xx++];
         *colors++ = table[src];
@@ -136,52 +136,52 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
     const uint16_t* xx = (const uint16_t*)(xy + 1);
 
     asm volatile (
-        "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags
-        "blt 2f \n\t" // if count < 0, branch to singles
-        "1: \n\t" // eights loop
-        "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
-        "uxth r4, r5 \n\t" // extract ptr 0
-        "mov r5, r5, lsr #16 \n\t" // extract ptr 1
-        "uxth r6, r7 \n\t" // extract ptr 2
-        "mov r7, r7, lsr #16 \n\t" // extract ptr 3
-        "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
-        "uxth r8, r9 \n\t" // extract ptr 4
-        "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
-        "mov r9, r9, lsr #16 \n\t" // extract ptr 5
-        "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
-        "uxth r10, r11 \n\t" // extract ptr 6
-        "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
-        "mov r11, r11, lsr #16 \n\t" // extract ptr 7
-        "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
-        "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
-        "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
-        "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
-        "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap
-        "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap
-        "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap
-        "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap
-        "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap
-        "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap
-        "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap
-        "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap
-        "subs %[count], %[count], #8 \n\t" // decrement loop counter
-        "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels
-        "bge 1b \n\t" // loop if counter >= 0
-        "2: \n\t"
-        "adds %[count], %[count], #8 \n\t" // fix up counter, set flags
-        "beq 4f \n\t" // if count == 0, branch to exit
-        "3: \n\t" // singles loop
-        "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr
-        "subs %[count], %[count], #1 \n\t" // decrement loop counter
-        "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image
-        "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap
-        "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr
-        "bne 3b \n\t" // loop if counter != 0
-        "4: \n\t" // exit
-        : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors)
-        : [table] "r" (table), [srcAddr] "r" (srcAddr)
-        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
-        );
+        "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags
+        "blt 2f \n\t" // if count < 0, branch to singles
+        "1: \n\t" // eights loop
+        "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
+        "uxth r4, r5 \n\t" // extract ptr 0
+        "mov r5, r5, lsr #16 \n\t" // extract ptr 1
+        "uxth r6, r7 \n\t" // extract ptr 2
+        "mov r7, r7, lsr #16 \n\t" // extract ptr 3
+        "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
+        "uxth r8, r9 \n\t" // extract ptr 4
+        "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
+        "mov r9, r9, lsr #16 \n\t" // extract ptr 5
+        "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
+        "uxth r10, r11 \n\t" // extract ptr 6
+        "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
+        "mov r11, r11, lsr #16 \n\t" // extract ptr 7
+        "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
+        "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
+        "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
+        "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
+        "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap
+        "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap
+        "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap
+        "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap
+        "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap
+        "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap
+        "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap
+        "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap
"subs %[count], %[count], #8 \n\t" // decrement loop counter + "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels + "bge 1b \n\t" // loop if counter >= 0 + "2: \n\t" + "adds %[count], %[count], #8 \n\t" // fix up counter, set flags + "beq 4f \n\t" // if count == 0, branch to exit + "3: \n\t" // singles loop + "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr + "subs %[count], %[count], #1 \n\t" // decrement loop counter + "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image + "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap + "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr + "bne 3b \n\t" // loop if counter != 0 + "4: \n\t" // exit + : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors) + : [table] "r" (table), [srcAddr] "r" (srcAddr) + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); } s.fBitmap->getColorTable()->unlockColors(); @@ -222,13 +222,201 @@ void SkBitmapProcState::platformProcs() { } } -/////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////// + +/* FUNCTIONS BELOW ARE SCALAR STUBS INTENDED FOR ARM DEVELOPERS TO REPLACE */ + +///////////////////////////////////// -extern void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs); -void platformConvolutionProcs_arm(SkConvolutionProcs* procs) { +static inline unsigned char ClampTo8(int a) { + if (static_cast<unsigned>(a) < 256) { + return a; // Avoid the extra check in the common case. + } + if (a < 0) { + return 0; + } + return 255; +} + +// Convolves horizontally along a single row. The row data is given in +// |srcData| and continues for the numValues() of the filter. +void convolveHorizontally_arm(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow, + bool hasAlpha) { + // Loop over each pixel on this row in the output image. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + // Apply the filter to the row to get the destination pixel in |accum|. + int accum[4] = {0}; + for (int filterX = 0; filterX < filterLength; filterX++) { + SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; + accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; + accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; + accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; + if (hasAlpha) { + accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; + } + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of fractional part. + accum[0] >>= SkConvolutionFilter1D::kShiftBits; + accum[1] >>= SkConvolutionFilter1D::kShiftBits; + accum[2] >>= SkConvolutionFilter1D::kShiftBits; + if (hasAlpha) { + accum[3] >>= SkConvolutionFilter1D::kShiftBits; + } + + // Store the new pixel. + outRow[outX * 4 + 0] = ClampTo8(accum[0]); + outRow[outX * 4 + 1] = ClampTo8(accum[1]); + outRow[outX * 4 + 2] = ClampTo8(accum[2]); + if (hasAlpha) { + outRow[outX * 4 + 3] = ClampTo8(accum[3]); + } + } +} + +// Does vertical convolution to produce one output row. 
+// Does vertical convolution to produce one output row. The filter values and
+// length are given in the first two parameters. These are applied to each
+// of the rows pointed to in the |sourceDataRows| array, with each row
+// being |pixelWidth| wide.
+//
+// The output must have room for |pixelWidth * 4| bytes.
+template<bool hasAlpha>
+void convolveVertically_arm(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow) {
+    // We go through each column in the output and do a vertical convolution,
+    // generating one output pixel each time.
+    for (int outX = 0; outX < pixelWidth; outX++) {
+        // Compute the number of bytes over in each row that the current column
+        // we're convolving starts at. The pixel will cover the next 4 bytes.
+        int byteOffset = outX * 4;
+
+        // Apply the filter to one column of pixels.
+        int accum[4] = {0};
+        for (int filterY = 0; filterY < filterLength; filterY++) {
+            SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
+            accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
+            accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
+            accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
+            if (hasAlpha) {
+                accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
+            }
+        }
+
+        // Bring this value back in range. All of the filter scaling factors
+        // are in fixed point with kShiftBits bits of precision.
+        accum[0] >>= SkConvolutionFilter1D::kShiftBits;
+        accum[1] >>= SkConvolutionFilter1D::kShiftBits;
+        accum[2] >>= SkConvolutionFilter1D::kShiftBits;
+        if (hasAlpha) {
+            accum[3] >>= SkConvolutionFilter1D::kShiftBits;
+        }
+
+        // Store the new pixel.
+        outRow[byteOffset + 0] = ClampTo8(accum[0]);
+        outRow[byteOffset + 1] = ClampTo8(accum[1]);
+        outRow[byteOffset + 2] = ClampTo8(accum[2]);
+        if (hasAlpha) {
+            unsigned char alpha = ClampTo8(accum[3]);
+
+            // Make sure the alpha channel doesn't come out smaller than any of the
+            // color channels. We use premultipled alpha channels, so this should
+            // never happen, but rounding errors will cause this from time to time.
+            // These "impossible" colors will cause overflows (and hence random pixel
+            // values) when the resulting bitmap is drawn to the screen.
+            //
+            // We only need to do this when generating the final output row (here).
+            int maxColorChannel = SkTMax(outRow[byteOffset + 0],
+                                         SkTMax(outRow[byteOffset + 1],
+                                                outRow[byteOffset + 2]));
+            if (alpha < maxColorChannel) {
+                outRow[byteOffset + 3] = maxColorChannel;
+            } else {
+                outRow[byteOffset + 3] = alpha;
+            }
+        } else {
+            // No alpha channel, the image is opaque.
+            outRow[byteOffset + 3] = 0xff;
+        }
+    }
+}
+
+void convolveVertically_arm(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
+                            int filterLength,
+                            unsigned char* const* sourceDataRows,
+                            int pixelWidth,
+                            unsigned char* outRow,
+                            bool sourceHasAlpha) {
+    if (sourceHasAlpha) {
+        convolveVertically_arm<true>(filterValues, filterLength,
+                                     sourceDataRows, pixelWidth,
+                                     outRow);
+    } else {
+        convolveVertically_arm<false>(filterValues, filterLength,
+                                      sourceDataRows, pixelWidth,
+                                      outRow);
+    }
+}
+
+// Convolves horizontally along four rows. The row data is given in
+// |src_data| and continues for the num_values() of the filter.
+// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
+// refer to that function for detailed comments.
+void convolve4RowsHorizontally_arm(const unsigned char* src_data[4],
+                                   const SkConvolutionFilter1D& filter,
+                                   unsigned char* out_row[4]) {
+}
+
+///////////////////////////
+
+/* STOP REWRITING FUNCTIONS HERE, BUT DON'T FORGET TO EDIT THE
+   PLATFORM CONVOLUTION PROCS BELOW */
+
+///////////////////////////
+
+void applySIMDPadding_arm(SkConvolutionFilter1D *filter) {
+    // Padding |paddingCount| of more dummy coefficients after the coefficients
+    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
+    // together to access invalid memory areas. We are not trying to align the
+    // coefficients right now due to the opaqueness of <vector> implementation.
+    // This has to be done after all |AddFilter| calls.
+    for (int i = 0; i < 8; ++i) {
+        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
+    }
 }
 
 void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
-    SK_ARM_NEON_WRAP(platformConvolutionProcs_arm)(procs);
+    if (sk_cpu_arm_has_neon()) {
+        procs->fExtraHorizontalReads = 3;
+        procs->fConvolveVertically = &convolveVertically_arm;
+
+        // next line is commented out because the four-row convolution function above is
+        // just a no-op. Please see the comment above its definition, and the SSE implementation
+        // in SkBitmapProcState_opts_SSE2.cpp for guidance on its semantics.
+        // leaving it as NULL will just cause the convolution system to not attempt
+        // to operate on four rows at once, which is correct but not performance-optimal.
+
+        // procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_arm;
+
+        procs->fConvolve4RowsHorizontally = NULL;
+
+        procs->fConvolveHorizontally = &convolveHorizontally_arm;
+        procs->fApplySIMDPadding = &applySIMDPadding_arm;
+    }
 }
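Note for readers of this revert: the NEON code being removed and the scalar stubs being added compute the same fixed-point convolution step per channel: multiply 8-bit samples by 16-bit fixed-point coefficients, accumulate in 32 bits, shift right by kShiftBits, then clamp to [0, 255]. The snippet below is a minimal standalone sketch of that arithmetic, not Skia source; the kShiftBits value of 14, the helper names, and the sample data are assumptions chosen only for illustration.

// fixed_point_convolution_sketch.cpp -- illustrative only, not part of Skia.
#include <cstdint>
#include <cstdio>
#include <vector>

static const int kShiftBits = 14;          // assumed to mirror SkConvolutionFilter1D::kShiftBits
typedef int16_t ConvolutionFixed;          // same storage width as Skia's ConvolutionFixed

static ConvolutionFixed toFixed(float f) {
    // Convert a floating-point weight into 2.14 fixed point.
    return static_cast<ConvolutionFixed>(f * (1 << kShiftBits));
}

static uint8_t clampTo8(int a) {
    // Clamp the shifted accumulator back into the 0..255 byte range.
    if (static_cast<unsigned>(a) < 256) return static_cast<uint8_t>(a);
    return a < 0 ? 0 : 255;
}

int main() {
    // Hypothetical 4-tap filter whose weights sum to ~1.0, applied to four
    // source samples of a single color channel.
    const std::vector<ConvolutionFixed> filter = {
        toFixed(0.125f), toFixed(0.375f), toFixed(0.375f), toFixed(0.125f)
    };
    const uint8_t src[4] = { 10, 200, 220, 30 };

    int accum = 0;
    for (size_t i = 0; i < filter.size(); ++i) {
        accum += filter[i] * src[i];       // 16-bit coeff * 8-bit sample fits easily in 32 bits
    }
    accum >>= kShiftBits;                  // drop the fractional bits
    printf("convolved value = %d\n", clampTo8(accum));
    return 0;
}

With these weights and samples the result is about 162; the NEON path in the removed file performs the same multiply/shift/saturate sequence with vmull_s16, vshrq_n_s32, and vqmovun_s16 on four pixels at a time.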