diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-07 15:57:32 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-07 15:57:32 +0000 |
commit | c51808445fa21d8527444ff0fba202872be8c1b7 (patch) | |
tree | 00045e943b25cd351426a2cf8893c3298cb08823 /src | |
parent | 500db7c7172052a5574d6f3dc5d077ae8c6b4f85 (diff) |
ARM Skia NEON patches - 33 - Convolution filter
NEON version of the convolutionProcs
The bitmap_scale benchmark is now twice as fast on ARM.
Signed-off-by: Kévin PETIT <kevin.petit@arm.com>
BUG=
Committed: http://code.google.com/p/skia/source/detail?r=12154
R=djsollen@google.com, mtklein@google.com, humper@google.com, epoger@google.com
Author: kevin.petit.arm@gmail.com
Review URL: https://codereview.chromium.org/27533004
git-svn-id: http://skia.googlecode.com/svn/trunk@12166 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/SkBitmapProcState_arm_neon.cpp | 420 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 384 |
2 files changed, 518 insertions, 286 deletions
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp index d50707dce0..35ba462c82 100644 --- a/src/opts/SkBitmapProcState_arm_neon.cpp +++ b/src/opts/SkBitmapProcState_arm_neon.cpp @@ -90,3 +90,423 @@ const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = { // Don't support A8 -> 565 NULL, NULL, NULL, NULL }; + +/////////////////////////////////////////////////////////////////////////////// + +#include <arm_neon.h> +#include "SkConvolver.h" + +// Convolves horizontally along a single row. The row data is given in +// |srcData| and continues for the numValues() of the filter. +void convolveHorizontally_neon(const unsigned char* srcData, + const SkConvolutionFilter1D& filter, + unsigned char* outRow, + bool hasAlpha) { + // Loop over each pixel on this row in the output image. + int numValues = filter.numValues(); + for (int outX = 0; outX < numValues; outX++) { + uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); + uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); + uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); + uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); + // Get the filter that determines the current output pixel. + int filterOffset, filterLength; + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // Compute the first pixel in this row that the filter affects. It will + // touch |filterLength| pixels (4 bytes each) after this. + const unsigned char* rowToFilter = &srcData[filterOffset * 4]; + + // Apply the filter to the row to get the destination pixel in |accum|. + int32x4_t accum = vdupq_n_s32(0); + for (int filterX = 0; filterX < filterLength >> 2; filterX++) { + // Load 4 coefficients + int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; + coeffs = vld1_s16(filterValues); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); + coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); + + // Load pixels and calc + uint8x16_t pixels = vld1q_u8(rowToFilter); + int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); + int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); + + int16x4_t p0_src = vget_low_s16(p01_16); + int16x4_t p1_src = vget_high_s16(p01_16); + int16x4_t p2_src = vget_low_s16(p23_16); + int16x4_t p3_src = vget_high_s16(p23_16); + + int32x4_t p0 = vmull_s16(p0_src, coeff0); + int32x4_t p1 = vmull_s16(p1_src, coeff1); + int32x4_t p2 = vmull_s16(p2_src, coeff2); + int32x4_t p3 = vmull_s16(p3_src, coeff3); + + accum += p0; + accum += p1; + accum += p2; + accum += p3; + + // Advance the pointers + rowToFilter += 16; + filterValues += 4; + } + int r = filterLength & 3; + if (r) { + const uint16_t mask[4][4] = { + {0, 0, 0, 0}, + {0xFFFF, 0, 0, 0}, + {0xFFFF, 0xFFFF, 0, 0}, + {0xFFFF, 0xFFFF, 0xFFFF, 0} + }; + uint16x4_t coeffs; + int16x4_t coeff0, coeff1, coeff2; + coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues)); + coeffs &= vld1_u16(&mask[r][0]); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2)); + + // Load pixels and calc + uint8x16_t pixels = vld1q_u8(rowToFilter); + int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); + int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); + int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0); + int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1); + int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2); + + accum += p0; + accum += p1; + accum += p2; + } + + // Bring this value back in range. All of the filter scaling factors + // are in fixed point with kShiftBits bits of fractional part. + accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); + + // Pack and store the new pixel. + int16x4_t accum16 = vqmovn_s32(accum); + uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0); + outRow += 4; + } +} + +// Does vertical convolution to produce one output row. The filter values and +// length are given in the first two parameters. These are applied to each +// of the rows pointed to in the |sourceDataRows| array, with each row +// being |pixelWidth| wide. +// +// The output must have room for |pixelWidth * 4| bytes. +template<bool hasAlpha> +void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow) { + int width = pixelWidth & ~3; + + int32x4_t accum0, accum1, accum2, accum3; + int16x4_t coeff16; + + // Output four pixels per iteration (16 bytes). + for (int outX = 0; outX < width; outX += 4) { + + // Accumulated result for each pixel. 32 bits per RGBA channel. + accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0); + + // Convolve with one filter coefficient per iteration. + for (int filterY = 0; filterY < filterLength; filterY++) { + + // Duplicate the filter coefficient 4 times. + // [16] cj cj cj cj + coeff16 = vdup_n_s16(filterValues[filterY]); + + // Load four pixels (16 bytes) together. + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); + + int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); + int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); + int16x4_t src16_0 = vget_low_s16(src16_01); + int16x4_t src16_1 = vget_high_s16(src16_01); + int16x4_t src16_2 = vget_low_s16(src16_23); + int16x4_t src16_3 = vget_high_s16(src16_23); + + accum0 += vmull_s16(src16_0, coeff16); + accum1 += vmull_s16(src16_1, coeff16); + accum2 += vmull_s16(src16_2, coeff16); + accum3 += vmull_s16(src16_3, coeff16); + } + + // Shift right for fixed point implementation. + accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); + accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); + + // Packing 32 bits |accum| to 16 bits per channel (signed saturation). + // [16] a1 b1 g1 r1 a0 b0 g0 r0 + int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); + // [16] a3 b3 g3 r3 a2 b2 g2 r2 + int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3)); + + // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); + + if (hasAlpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = vmaxq_u8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum8 = vmaxq_u8(b, accum8); + } else { + // Set value of alpha channels to 0xFF. + accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); + } + + // Store the convolution result (16 bytes) and advance the pixel pointers. + vst1q_u8(outRow, accum8); + outRow += 16; + } + + // Process the leftovers when the width of the output is not divisible + // by 4, that is at most 3 pixels. + int r = pixelWidth & 3; + if (r) { + + accum0 = accum1 = accum2 = vdupq_n_s32(0); + + for (int filterY = 0; filterY < filterLength; ++filterY) { + coeff16 = vdup_n_s16(filterValues[filterY]); + + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]); + + int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); + int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); + int16x4_t src16_0 = vget_low_s16(src16_01); + int16x4_t src16_1 = vget_high_s16(src16_01); + int16x4_t src16_2 = vget_low_s16(src16_23); + + accum0 += vmull_s16(src16_0, coeff16); + accum1 += vmull_s16(src16_1, coeff16); + accum2 += vmull_s16(src16_2, coeff16); + } + + accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); + accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); + accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); + + int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); + int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2)); + + uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); + + if (hasAlpha) { + // Compute the max(ri, gi, bi) for each pixel. + // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 + uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g + // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 + a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); + // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 + b = vmaxq_u8(a, b); // Max of r and g and b. + // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 + b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); + + // Make sure the value of alpha channel is always larger than maximum + // value of color channels. + accum8 = vmaxq_u8(b, accum8); + } else { + // Set value of alpha channels to 0xFF. + accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000)); + } + + switch(r) { + case 1: + vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0); + break; + case 2: + vst1_u32(reinterpret_cast<uint32_t*>(outRow), + vreinterpret_u32_u8(vget_low_u8(accum8))); + break; + case 3: + vst1_u32(reinterpret_cast<uint32_t*>(outRow), + vreinterpret_u32_u8(vget_low_u8(accum8))); + vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2); + break; + } + } +} + +void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, + int filterLength, + unsigned char* const* sourceDataRows, + int pixelWidth, + unsigned char* outRow, + bool sourceHasAlpha) { + if (sourceHasAlpha) { + convolveVertically_neon<true>(filterValues, filterLength, + sourceDataRows, pixelWidth, + outRow); + } else { + convolveVertically_neon<false>(filterValues, filterLength, + sourceDataRows, pixelWidth, + outRow); + } +} + +// Convolves horizontally along four rows. The row data is given in +// |src_data| and continues for the num_values() of the filter. +// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please +// refer to that function for detailed comments. +void convolve4RowsHorizontally_neon(const unsigned char* srcData[4], + const SkConvolutionFilter1D& filter, + unsigned char* outRow[4]) { + + uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); + uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); + uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); + uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); + int num_values = filter.numValues(); + + int filterOffset, filterLength; + // |mask| will be used to decimate all extra filter coefficients that are + // loaded by SIMD when |filter_length| is not divisible by 4. + // mask[0] is not used in following algorithm. + const uint16_t mask[4][4] = { + {0, 0, 0, 0}, + {0xFFFF, 0, 0, 0}, + {0xFFFF, 0xFFFF, 0, 0}, + {0xFFFF, 0xFFFF, 0xFFFF, 0} + }; + + // Output one pixel each iteration, calculating all channels (RGBA) together. + for (int outX = 0; outX < num_values; outX++) { + + const SkConvolutionFilter1D::ConvolutionFixed* filterValues = + filter.FilterForValue(outX, &filterOffset, &filterLength); + + // four pixels in a column per iteration. + int32x4_t accum0 = vdupq_n_s32(0); + int32x4_t accum1 = vdupq_n_s32(0); + int32x4_t accum2 = vdupq_n_s32(0); + int32x4_t accum3 = vdupq_n_s32(0); + + int start = (filterOffset<<2); + + // We will load and accumulate with four coefficients per iteration. + for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) { + int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; + + coeffs = vld1_s16(filterValues); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); + coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); + + uint8x16_t pixels; + int16x8_t p01_16, p23_16; + int32x4_t p0, p1, p2, p3; + + +#define ITERATION(src, accum) \ + pixels = vld1q_u8(src); \ + p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \ + p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \ + p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \ + p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \ + p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \ + p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \ + accum += p0; \ + accum += p1; \ + accum += p2; \ + accum += p3 + + ITERATION(srcData[0] + start, accum0); + ITERATION(srcData[1] + start, accum1); + ITERATION(srcData[2] + start, accum2); + ITERATION(srcData[3] + start, accum3); + + start += 16; + filterValues += 4; + } + + int r = filterLength & 3; + if (r) { + int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; + coeffs = vld1_s16(filterValues); + coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0])); + coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); + coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); + coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); + coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); + + uint8x16_t pixels; + int16x8_t p01_16, p23_16; + int32x4_t p0, p1, p2, p3; + + ITERATION(srcData[0] + start, accum0); + ITERATION(srcData[1] + start, accum1); + ITERATION(srcData[2] + start, accum2); + ITERATION(srcData[3] + start, accum3); + } + + int16x4_t accum16; + uint8x8_t res0, res1, res2, res3; + +#define PACK_RESULT(accum, res) \ + accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ + accum16 = vqmovn_s32(accum); \ + res = vqmovun_s16(vcombine_s16(accum16, accum16)); + + PACK_RESULT(accum0, res0); + PACK_RESULT(accum1, res1); + PACK_RESULT(accum2, res2); + PACK_RESULT(accum3, res3); + + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0); + vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0); + outRow[0] += 4; + outRow[1] += 4; + outRow[2] += 4; + outRow[3] += 4; + } +} + +void applySIMDPadding_neon(SkConvolutionFilter1D *filter) { + // Padding |paddingCount| of more dummy coefficients after the coefficients + // of last filter to prevent SIMD instructions which load 8 or 16 bytes + // together to access invalid memory areas. We are not trying to align the + // coefficients right now due to the opaqueness of <vector> implementation. + // This has to be done after all |AddFilter| calls. + for (int i = 0; i < 8; ++i) { + filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); + } +} + +void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) { + procs->fExtraHorizontalReads = 3; + procs->fConvolveVertically = &convolveVertically_neon; + procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon; + procs->fConvolveHorizontally = &convolveHorizontally_neon; + procs->fApplySIMDPadding = &applySIMDPadding_neon; +} + diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index badb0f4d3b..96fbebd4e1 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -50,54 +50,54 @@ void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s, const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1); asm volatile ( - "cmp %[count8], #0 \n\t" // compare loop counter with 0 - "beq 2f \n\t" // if loop counter == 0, exit - "1: \n\t" - "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 - "subs %[count8], %[count8], #1 \n\t" // decrement loop counter - "uxth r4, r5 \n\t" // extract ptr 0 - "mov r5, r5, lsr #16 \n\t" // extract ptr 1 - "uxth r6, r7 \n\t" // extract ptr 2 - "mov r7, r7, lsr #16 \n\t" // extract ptr 3 - "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image - "uxth r8, r9 \n\t" // extract ptr 4 - "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image - "mov r9, r9, lsr #16 \n\t" // extract ptr 5 - "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image - "uxth r10, r11 \n\t" // extract ptr 6 - "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image - "mov r11, r11, lsr #16 \n\t" // extract ptr 7 - "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image - "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup - "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image - "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup - "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image - "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup - "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image - "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup - "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap - "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup - "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap - "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup - "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap - "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup - "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap - "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup - "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap - "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap - "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap - "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap - "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1 - "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3 - "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5 - "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7 - "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels - "bgt 1b \n\t" // loop if counter > 0 - "2: \n\t" - : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors) - : [table] "r" (table), [srcAddr] "r" (srcAddr) - : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); + "cmp %[count8], #0 \n\t" // compare loop counter with 0 + "beq 2f \n\t" // if loop counter == 0, exit + "1: \n\t" + "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 + "subs %[count8], %[count8], #1 \n\t" // decrement loop counter + "uxth r4, r5 \n\t" // extract ptr 0 + "mov r5, r5, lsr #16 \n\t" // extract ptr 1 + "uxth r6, r7 \n\t" // extract ptr 2 + "mov r7, r7, lsr #16 \n\t" // extract ptr 3 + "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image + "uxth r8, r9 \n\t" // extract ptr 4 + "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image + "mov r9, r9, lsr #16 \n\t" // extract ptr 5 + "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image + "uxth r10, r11 \n\t" // extract ptr 6 + "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image + "mov r11, r11, lsr #16 \n\t" // extract ptr 7 + "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image + "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup + "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image + "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup + "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image + "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup + "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image + "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup + "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap + "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup + "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap + "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup + "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap + "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup + "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap + "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup + "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap + "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap + "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap + "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap + "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1 + "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3 + "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5 + "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7 + "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels + "bgt 1b \n\t" // loop if counter > 0 + "2: \n\t" + : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors) + : [table] "r" (table), [srcAddr] "r" (srcAddr) + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); for (i = (count & 7); i > 0; --i) { src = srcAddr[*xx++]; *colors++ = table[src]; @@ -136,52 +136,52 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, const uint16_t* xx = (const uint16_t*)(xy + 1); asm volatile ( - "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags - "blt 2f \n\t" // if count < 0, branch to singles - "1: \n\t" // eights loop - "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 - "uxth r4, r5 \n\t" // extract ptr 0 - "mov r5, r5, lsr #16 \n\t" // extract ptr 1 - "uxth r6, r7 \n\t" // extract ptr 2 - "mov r7, r7, lsr #16 \n\t" // extract ptr 3 - "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image - "uxth r8, r9 \n\t" // extract ptr 4 - "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image - "mov r9, r9, lsr #16 \n\t" // extract ptr 5 - "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image - "uxth r10, r11 \n\t" // extract ptr 6 - "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image - "mov r11, r11, lsr #16 \n\t" // extract ptr 7 - "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image - "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image - "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image - "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image - "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap - "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap - "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap - "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap - "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap - "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap - "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap - "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap - "subs %[count], %[count], #8 \n\t" // decrement loop counter - "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels - "bge 1b \n\t" // loop if counter >= 0 - "2: \n\t" - "adds %[count], %[count], #8 \n\t" // fix up counter, set flags - "beq 4f \n\t" // if count == 0, branch to exit - "3: \n\t" // singles loop - "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr - "subs %[count], %[count], #1 \n\t" // decrement loop counter - "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image - "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap - "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr - "bne 3b \n\t" // loop if counter != 0 - "4: \n\t" // exit - : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors) - : [table] "r" (table), [srcAddr] "r" (srcAddr) - : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); + "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags + "blt 2f \n\t" // if count < 0, branch to singles + "1: \n\t" // eights loop + "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 + "uxth r4, r5 \n\t" // extract ptr 0 + "mov r5, r5, lsr #16 \n\t" // extract ptr 1 + "uxth r6, r7 \n\t" // extract ptr 2 + "mov r7, r7, lsr #16 \n\t" // extract ptr 3 + "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image + "uxth r8, r9 \n\t" // extract ptr 4 + "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image + "mov r9, r9, lsr #16 \n\t" // extract ptr 5 + "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image + "uxth r10, r11 \n\t" // extract ptr 6 + "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image + "mov r11, r11, lsr #16 \n\t" // extract ptr 7 + "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image + "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image + "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image + "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image + "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap + "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap + "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap + "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap + "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap + "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap + "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap + "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap + "subs %[count], %[count], #8 \n\t" // decrement loop counter + "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels + "bge 1b \n\t" // loop if counter >= 0 + "2: \n\t" + "adds %[count], %[count], #8 \n\t" // fix up counter, set flags + "beq 4f \n\t" // if count == 0, branch to exit + "3: \n\t" // singles loop + "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr + "subs %[count], %[count], #1 \n\t" // decrement loop counter + "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image + "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap + "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr + "bne 3b \n\t" // loop if counter != 0 + "4: \n\t" // exit + : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors) + : [table] "r" (table), [srcAddr] "r" (srcAddr) + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); } s.fBitmap->getColorTable()->unlockColors(); @@ -222,201 +222,13 @@ void SkBitmapProcState::platformProcs() { } } -///////////////////////////////////// - -/* FUNCTIONS BELOW ARE SCALAR STUBS INTENDED FOR ARM DEVELOPERS TO REPLACE */ - -///////////////////////////////////// - - -static inline unsigned char ClampTo8(int a) { - if (static_cast<unsigned>(a) < 256) { - return a; // Avoid the extra check in the common case. - } - if (a < 0) { - return 0; - } - return 255; -} - -// Convolves horizontally along a single row. The row data is given in -// |srcData| and continues for the numValues() of the filter. -void convolveHorizontally_arm(const unsigned char* srcData, - const SkConvolutionFilter1D& filter, - unsigned char* outRow, - bool hasAlpha) { - // Loop over each pixel on this row in the output image. - int numValues = filter.numValues(); - for (int outX = 0; outX < numValues; outX++) { - // Get the filter that determines the current output pixel. - int filterOffset, filterLength; - const SkConvolutionFilter1D::ConvolutionFixed* filterValues = - filter.FilterForValue(outX, &filterOffset, &filterLength); - - // Compute the first pixel in this row that the filter affects. It will - // touch |filterLength| pixels (4 bytes each) after this. - const unsigned char* rowToFilter = &srcData[filterOffset * 4]; - - // Apply the filter to the row to get the destination pixel in |accum|. - int accum[4] = {0}; - for (int filterX = 0; filterX < filterLength; filterX++) { - SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; - accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; - accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; - accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; - if (hasAlpha) { - accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; - } - } - - // Bring this value back in range. All of the filter scaling factors - // are in fixed point with kShiftBits bits of fractional part. - accum[0] >>= SkConvolutionFilter1D::kShiftBits; - accum[1] >>= SkConvolutionFilter1D::kShiftBits; - accum[2] >>= SkConvolutionFilter1D::kShiftBits; - if (hasAlpha) { - accum[3] >>= SkConvolutionFilter1D::kShiftBits; - } - - // Store the new pixel. - outRow[outX * 4 + 0] = ClampTo8(accum[0]); - outRow[outX * 4 + 1] = ClampTo8(accum[1]); - outRow[outX * 4 + 2] = ClampTo8(accum[2]); - if (hasAlpha) { - outRow[outX * 4 + 3] = ClampTo8(accum[3]); - } - } -} - -// Does vertical convolution to produce one output row. The filter values and -// length are given in the first two parameters. These are applied to each -// of the rows pointed to in the |sourceDataRows| array, with each row -// being |pixelWidth| wide. -// -// The output must have room for |pixelWidth * 4| bytes. -template<bool hasAlpha> - void convolveVertically_arm(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow) { - // We go through each column in the output and do a vertical convolution, - // generating one output pixel each time. - for (int outX = 0; outX < pixelWidth; outX++) { - // Compute the number of bytes over in each row that the current column - // we're convolving starts at. The pixel will cover the next 4 bytes. - int byteOffset = outX * 4; - - // Apply the filter to one column of pixels. - int accum[4] = {0}; - for (int filterY = 0; filterY < filterLength; filterY++) { - SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; - accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; - accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; - accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; - if (hasAlpha) { - accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; - } - } - - // Bring this value back in range. All of the filter scaling factors - // are in fixed point with kShiftBits bits of precision. - accum[0] >>= SkConvolutionFilter1D::kShiftBits; - accum[1] >>= SkConvolutionFilter1D::kShiftBits; - accum[2] >>= SkConvolutionFilter1D::kShiftBits; - if (hasAlpha) { - accum[3] >>= SkConvolutionFilter1D::kShiftBits; - } - - // Store the new pixel. - outRow[byteOffset + 0] = ClampTo8(accum[0]); - outRow[byteOffset + 1] = ClampTo8(accum[1]); - outRow[byteOffset + 2] = ClampTo8(accum[2]); - if (hasAlpha) { - unsigned char alpha = ClampTo8(accum[3]); - - // Make sure the alpha channel doesn't come out smaller than any of the - // color channels. We use premultipled alpha channels, so this should - // never happen, but rounding errors will cause this from time to time. - // These "impossible" colors will cause overflows (and hence random pixel - // values) when the resulting bitmap is drawn to the screen. - // - // We only need to do this when generating the final output row (here). - int maxColorChannel = SkTMax(outRow[byteOffset + 0], - SkTMax(outRow[byteOffset + 1], - outRow[byteOffset + 2])); - if (alpha < maxColorChannel) { - outRow[byteOffset + 3] = maxColorChannel; - } else { - outRow[byteOffset + 3] = alpha; - } - } else { - // No alpha channel, the image is opaque. - outRow[byteOffset + 3] = 0xff; - } - } - } - -void convolveVertically_arm(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, - int filterLength, - unsigned char* const* sourceDataRows, - int pixelWidth, - unsigned char* outRow, - bool sourceHasAlpha) { - if (sourceHasAlpha) { - convolveVertically_arm<true>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } else { - convolveVertically_arm<false>(filterValues, filterLength, - sourceDataRows, pixelWidth, - outRow); - } -} - -// Convolves horizontally along four rows. The row data is given in -// |src_data| and continues for the num_values() of the filter. -// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please -// refer to that function for detailed comments. -void convolve4RowsHorizontally_arm(const unsigned char* src_data[4], - const SkConvolutionFilter1D& filter, - unsigned char* out_row[4]) { -} - -/////////////////////////// - -/* STOP REWRITING FUNCTIONS HERE, BUT DON'T FORGET TO EDIT THE - PLATFORM CONVOLUTION PROCS BELOW */ +/////////////////////////////////////////////////////////////////////////////// -/////////////////////////// +extern void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs); -void applySIMDPadding_arm(SkConvolutionFilter1D *filter) { - // Padding |paddingCount| of more dummy coefficients after the coefficients - // of last filter to prevent SIMD instructions which load 8 or 16 bytes - // together to access invalid memory areas. We are not trying to align the - // coefficients right now due to the opaqueness of <vector> implementation. - // This has to be done after all |AddFilter| calls. - for (int i = 0; i < 8; ++i) { - filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); - } +void platformConvolutionProcs_arm(SkConvolutionProcs* procs) { } void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) { - if (sk_cpu_arm_has_neon()) { - procs->fExtraHorizontalReads = 3; - procs->fConvolveVertically = &convolveVertically_arm; - - // next line is commented out because the four-row convolution function above is - // just a no-op. Please see the comment above its definition, and the SSE implementation - // in SkBitmapProcState_opts_SSE2.cpp for guidance on its semantics. - // leaving it as NULL will just cause the convolution system to not attempt - // to operate on four rows at once, which is correct but not performance-optimal. - - // procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_arm; - - procs->fConvolve4RowsHorizontally = NULL; - - procs->fConvolveHorizontally = &convolveHorizontally_arm; - procs->fApplySIMDPadding = &applySIMDPadding_arm; - } + SK_ARM_NEON_WRAP(platformConvolutionProcs_arm)(procs); } |