aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-05-05 09:58:31 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-05-06 14:57:12 +0000
commitc6820383b2526de95296ed8436f76333e0651d75 (patch)
tree3b55e949fcb14cb7f4325469dc61077bc7764507 /src/opts
parent4c6e4103a246c27bdd1302a9c7fba64367758dcc (diff)
remove old 565 destination opts
This is not an important format, and the code is dead or close to it. The code is an occasional maintenance burden so I'd like it gone. Change-Id: I4ad921533abf3211e6a81e6e475b848795eea060 Reviewed-on: https://skia-review.googlesource.com/15600 Reviewed-by: Mike Reed <reed@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkBlitRow_opts_SSE2.cpp547
-rw-r--r--src/opts/SkBlitRow_opts_SSE2.h15
-rw-r--r--src/opts/SkBlitRow_opts_arm.cpp16
-rw-r--r--src/opts/SkBlitRow_opts_arm_neon.cpp1099
-rw-r--r--src/opts/SkBlitRow_opts_arm_neon.h2
-rw-r--r--src/opts/SkBlitRow_opts_none.cpp8
-rw-r--r--src/opts/opts_check_x86.cpp37
7 files changed, 0 insertions, 1724 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 7ce1fc9a80..7f03907d1c 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -103,75 +103,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
}
}
-void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
- SkASSERT(count > 0);
-
- uint32_t src_expand = (SkGetPackedG32(src) << 24) |
- (SkGetPackedR32(src) << 13) |
- (SkGetPackedB32(src) << 2);
- unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
-
- // Check if we have enough pixels to run SIMD
- if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
- __m128i* dst_wide;
- const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
- const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
- const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
- const __m128i scale_wide = _mm_set1_epi16(scale);
- const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
- const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
-
- // Align dst to an even 16 byte address (0-7 pixels)
- while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
- *dst = SkBlend32_RGB16(src_expand, *dst, scale);
- dst += 1;
- count--;
- }
-
- dst_wide = reinterpret_cast<__m128i*>(dst);
- do {
- // Load eight RGB565 pixels
- __m128i pixels = _mm_load_si128(dst_wide);
-
- // Mask out sub-pixels
- __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
- __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
- pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
- __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
-
- // Scale with alpha
- pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
- pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
- pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
-
- // Add src_X_wide and shift down again
- pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
- pixel_R = _mm_srli_epi16(pixel_R, 5);
- pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
- pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
- pixel_B = _mm_srli_epi16(pixel_B, 5);
-
- // Combine into RGB565 and store
- pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
- pixel_G = _mm_and_si128(pixel_G, mask_green);
- pixels = _mm_or_si128(pixel_R, pixel_G);
- pixels = _mm_or_si128(pixels, pixel_B);
- _mm_store_si128(dst_wide, pixels);
- count -= 8;
- dst_wide++;
- } while (count >= 8);
-
- dst = reinterpret_cast<uint16_t*>(dst_wide);
- }
-
- // Small loop to handle remaining pixels.
- while (count > 0) {
- *dst = SkBlend32_RGB16(src_expand, *dst, scale);
- dst += 1;
- count--;
- }
-}
-
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
@@ -510,481 +441,3 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
width--;
}
}
-
-/* SSE2 version of S32_D565_Opaque()
- * portable version is in core/SkBlitRow_D16.cpp
- */
-void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 == alpha);
-
- if (count <= 0) {
- return;
- }
-
- if (count >= 8) {
- while (((size_t)dst & 0x0F) != 0) {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
-
- *dst++ = SkPixel32ToPixel16_ToU16(c);
- count--;
- }
-
- const __m128i* s = reinterpret_cast<const __m128i*>(src);
- __m128i* d = reinterpret_cast<__m128i*>(dst);
-
- while (count >= 8) {
- // Load 8 pixels of src.
- __m128i src_pixel1 = _mm_loadu_si128(s++);
- __m128i src_pixel2 = _mm_loadu_si128(s++);
-
- __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
- _mm_store_si128(d++, d_pixel);
- count -= 8;
- }
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<uint16_t*>(d);
- }
-
- if (count > 0) {
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- *dst++ = SkPixel32ToPixel16_ToU16(c);
- } while (--count != 0);
- }
-}
-
-/* SSE2 version of S32A_D565_Opaque()
- * portable version is in core/SkBlitRow_D16.cpp
- */
-void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 == alpha);
-
- if (count <= 0) {
- return;
- }
-
- if (count >= 8) {
- // Make dst 16 bytes alignment
- while (((size_t)dst & 0x0F) != 0) {
- SkPMColor c = *src++;
- if (c) {
- *dst = SkSrcOver32To16(c, *dst);
- }
- dst += 1;
- count--;
- }
-
- const __m128i* s = reinterpret_cast<const __m128i*>(src);
- __m128i* d = reinterpret_cast<__m128i*>(dst);
- __m128i var255 = _mm_set1_epi16(255);
- __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
- __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
- __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
-
- while (count >= 8) {
- // Load 8 pixels of src.
- __m128i src_pixel1 = _mm_loadu_si128(s++);
- __m128i src_pixel2 = _mm_loadu_si128(s++);
-
- // Check whether src pixels are equal to 0 and get the highest bit
- // of each byte of result, if src pixels are all zero, src_cmp1 and
- // src_cmp2 will be 0xFFFF.
- int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
- _mm_setzero_si128()));
- int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
- _mm_setzero_si128()));
- if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
- d++;
- count -= 8;
- continue;
- }
-
- // Load 8 pixels of dst.
- __m128i dst_pixel = _mm_load_si128(d);
-
- // Extract A from src.
- __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
- sa1 = _mm_srli_epi32(sa1, 24);
- __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
- sa2 = _mm_srli_epi32(sa2, 24);
- __m128i sa = _mm_packs_epi32(sa1, sa2);
-
- // Extract R from src.
- __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
- sr1 = _mm_srli_epi32(sr1, 24);
- __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
- sr2 = _mm_srli_epi32(sr2, 24);
- __m128i sr = _mm_packs_epi32(sr1, sr2);
-
- // Extract G from src.
- __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
- sg1 = _mm_srli_epi32(sg1, 24);
- __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
- sg2 = _mm_srli_epi32(sg2, 24);
- __m128i sg = _mm_packs_epi32(sg1, sg2);
-
- // Extract B from src.
- __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
- sb1 = _mm_srli_epi32(sb1, 24);
- __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
- sb2 = _mm_srli_epi32(sb2, 24);
- __m128i sb = _mm_packs_epi32(sb1, sb2);
-
- // Extract R G B from dst.
- __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
- dr = _mm_and_si128(dr, r16_mask);
- __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
- dg = _mm_and_si128(dg, g16_mask);
- __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
- db = _mm_and_si128(db, b16_mask);
-
- __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
-
- // Calculate R G B of result.
- // Original algorithm is in SkSrcOver32To16().
- dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
- dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
- dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
- dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
- db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
- db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
-
- // Pack R G B into 16-bit color.
- __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
-
- // Store 8 16-bit colors in dst.
- _mm_store_si128(d++, d_pixel);
- count -= 8;
- }
-
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<uint16_t*>(d);
- }
-
- if (count > 0) {
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- if (c) {
- *dst = SkSrcOver32To16(c, *dst);
- }
- dst += 1;
- } while (--count != 0);
- }
-}
-
-void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y) {
- SkASSERT(255 == alpha);
-
- if (count <= 0) {
- return;
- }
-
- if (count >= 8) {
- while (((size_t)dst & 0x0F) != 0) {
- DITHER_565_SCAN(y);
- SkPMColor c = *src++;
- SkPMColorAssert(c);
-
- unsigned dither = DITHER_VALUE(x);
- *dst++ = SkDitherRGB32To565(c, dither);
- DITHER_INC_X(x);
- count--;
- }
-
- unsigned short dither_value[8];
- __m128i dither;
-#ifdef ENABLE_DITHER_MATRIX_4X4
- const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
- dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
- dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
- dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
- dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
-#else
- const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
- dither_value[0] = dither_value[4] = (dither_scan
- >> (((x) & 3) << 2)) & 0xF;
- dither_value[1] = dither_value[5] = (dither_scan
- >> (((x + 1) & 3) << 2)) & 0xF;
- dither_value[2] = dither_value[6] = (dither_scan
- >> (((x + 2) & 3) << 2)) & 0xF;
- dither_value[3] = dither_value[7] = (dither_scan
- >> (((x + 3) & 3) << 2)) & 0xF;
-#endif
- dither = _mm_loadu_si128((__m128i*) dither_value);
-
- const __m128i* s = reinterpret_cast<const __m128i*>(src);
- __m128i* d = reinterpret_cast<__m128i*>(dst);
-
- while (count >= 8) {
- // Load 8 pixels of src.
- __m128i src_pixel1 = _mm_loadu_si128(s++);
- __m128i src_pixel2 = _mm_loadu_si128(s++);
-
- // Extract R from src.
- __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
- sr1 = _mm_srli_epi32(sr1, 24);
- __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
- sr2 = _mm_srli_epi32(sr2, 24);
- __m128i sr = _mm_packs_epi32(sr1, sr2);
-
- // SkDITHER_R32To565(sr, dither)
- __m128i sr_offset = _mm_srli_epi16(sr, 5);
- sr = _mm_add_epi16(sr, dither);
- sr = _mm_sub_epi16(sr, sr_offset);
- sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
-
- // Extract G from src.
- __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
- sg1 = _mm_srli_epi32(sg1, 24);
- __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
- sg2 = _mm_srli_epi32(sg2, 24);
- __m128i sg = _mm_packs_epi32(sg1, sg2);
-
- // SkDITHER_R32To565(sg, dither)
- __m128i sg_offset = _mm_srli_epi16(sg, 6);
- sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
- sg = _mm_sub_epi16(sg, sg_offset);
- sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
-
- // Extract B from src.
- __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
- sb1 = _mm_srli_epi32(sb1, 24);
- __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
- sb2 = _mm_srli_epi32(sb2, 24);
- __m128i sb = _mm_packs_epi32(sb1, sb2);
-
- // SkDITHER_R32To565(sb, dither)
- __m128i sb_offset = _mm_srli_epi16(sb, 5);
- sb = _mm_add_epi16(sb, dither);
- sb = _mm_sub_epi16(sb, sb_offset);
- sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
-
- // Pack and store 16-bit dst pixel.
- __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
- _mm_store_si128(d++, d_pixel);
-
- count -= 8;
- x += 8;
- }
-
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<uint16_t*>(d);
- }
-
- if (count > 0) {
- DITHER_565_SCAN(y);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
-
- unsigned dither = DITHER_VALUE(x);
- *dst++ = SkDitherRGB32To565(c, dither);
- DITHER_INC_X(x);
- } while (--count != 0);
- }
-}
-
-/* SSE2 version of S32A_D565_Opaque_Dither()
- * portable version is in core/SkBlitRow_D16.cpp
- */
-void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y) {
- SkASSERT(255 == alpha);
-
- if (count <= 0) {
- return;
- }
-
- if (count >= 8) {
- while (((size_t)dst & 0x0F) != 0) {
- DITHER_565_SCAN(y);
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- if (c) {
- unsigned a = SkGetPackedA32(c);
-
- int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
-
- unsigned sr = SkGetPackedR32(c);
- unsigned sg = SkGetPackedG32(c);
- unsigned sb = SkGetPackedB32(c);
- sr = SkDITHER_R32_FOR_565(sr, d);
- sg = SkDITHER_G32_FOR_565(sg, d);
- sb = SkDITHER_B32_FOR_565(sb, d);
-
- uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
- uint32_t dst_expanded = SkExpand_rgb_16(*dst);
- dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
- // now src and dst expanded are in g:11 r:10 x:1 b:10
- *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- }
- dst += 1;
- DITHER_INC_X(x);
- count--;
- }
-
- unsigned short dither_value[8];
- __m128i dither, dither_cur;
-#ifdef ENABLE_DITHER_MATRIX_4X4
- const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
- dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
- dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
- dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
- dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
-#else
- const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
- dither_value[0] = dither_value[4] = (dither_scan
- >> (((x) & 3) << 2)) & 0xF;
- dither_value[1] = dither_value[5] = (dither_scan
- >> (((x + 1) & 3) << 2)) & 0xF;
- dither_value[2] = dither_value[6] = (dither_scan
- >> (((x + 2) & 3) << 2)) & 0xF;
- dither_value[3] = dither_value[7] = (dither_scan
- >> (((x + 3) & 3) << 2)) & 0xF;
-#endif
- dither = _mm_loadu_si128((__m128i*) dither_value);
-
- const __m128i* s = reinterpret_cast<const __m128i*>(src);
- __m128i* d = reinterpret_cast<__m128i*>(dst);
- __m128i var256 = _mm_set1_epi16(256);
- __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
- __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
- __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
-
- while (count >= 8) {
- // Load 8 pixels of src and dst.
- __m128i src_pixel1 = _mm_loadu_si128(s++);
- __m128i src_pixel2 = _mm_loadu_si128(s++);
- __m128i dst_pixel = _mm_load_si128(d);
-
- // Extract A from src.
- __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
- sa1 = _mm_srli_epi32(sa1, 24);
- __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
- sa2 = _mm_srli_epi32(sa2, 24);
- __m128i sa = _mm_packs_epi32(sa1, sa2);
-
- // Calculate current dither value.
- dither_cur = _mm_mullo_epi16(dither,
- _mm_add_epi16(sa, _mm_set1_epi16(1)));
- dither_cur = _mm_srli_epi16(dither_cur, 8);
-
- // Extract R from src.
- __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
- sr1 = _mm_srli_epi32(sr1, 24);
- __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
- sr2 = _mm_srli_epi32(sr2, 24);
- __m128i sr = _mm_packs_epi32(sr1, sr2);
-
- // SkDITHER_R32_FOR_565(sr, d)
- __m128i sr_offset = _mm_srli_epi16(sr, 5);
- sr = _mm_add_epi16(sr, dither_cur);
- sr = _mm_sub_epi16(sr, sr_offset);
-
- // Expand sr.
- sr = _mm_slli_epi16(sr, 2);
-
- // Extract G from src.
- __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
- sg1 = _mm_srli_epi32(sg1, 24);
- __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
- sg2 = _mm_srli_epi32(sg2, 24);
- __m128i sg = _mm_packs_epi32(sg1, sg2);
-
- // sg = SkDITHER_G32_FOR_565(sg, d).
- __m128i sg_offset = _mm_srli_epi16(sg, 6);
- sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
- sg = _mm_sub_epi16(sg, sg_offset);
-
- // Expand sg.
- sg = _mm_slli_epi16(sg, 3);
-
- // Extract B from src.
- __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
- sb1 = _mm_srli_epi32(sb1, 24);
- __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
- sb2 = _mm_srli_epi32(sb2, 24);
- __m128i sb = _mm_packs_epi32(sb1, sb2);
-
- // sb = SkDITHER_B32_FOR_565(sb, d).
- __m128i sb_offset = _mm_srli_epi16(sb, 5);
- sb = _mm_add_epi16(sb, dither_cur);
- sb = _mm_sub_epi16(sb, sb_offset);
-
- // Expand sb.
- sb = _mm_slli_epi16(sb, 2);
-
- // Extract R G B from dst.
- __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
- dr = _mm_and_si128(dr, r16_mask);
- __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
- dg = _mm_and_si128(dg, g16_mask);
- __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
- db = _mm_and_si128(db, b16_mask);
-
- // SkAlpha255To256(255 - a) >> 3
- __m128i isa = _mm_sub_epi16(var256, sa);
- isa = _mm_srli_epi16(isa, 3);
-
- dr = _mm_mullo_epi16(dr, isa);
- dr = _mm_add_epi16(dr, sr);
- dr = _mm_srli_epi16(dr, 5);
-
- dg = _mm_mullo_epi16(dg, isa);
- dg = _mm_add_epi16(dg, sg);
- dg = _mm_srli_epi16(dg, 5);
-
- db = _mm_mullo_epi16(db, isa);
- db = _mm_add_epi16(db, sb);
- db = _mm_srli_epi16(db, 5);
-
- // Package and store dst pixel.
- __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
- _mm_store_si128(d++, d_pixel);
-
- count -= 8;
- x += 8;
- }
-
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<uint16_t*>(d);
- }
-
- if (count > 0) {
- DITHER_565_SCAN(y);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- if (c) {
- unsigned a = SkGetPackedA32(c);
-
- int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
-
- unsigned sr = SkGetPackedR32(c);
- unsigned sg = SkGetPackedG32(c);
- unsigned sb = SkGetPackedB32(c);
- sr = SkDITHER_R32_FOR_565(sr, d);
- sg = SkDITHER_G32_FOR_565(sg, d);
- sb = SkDITHER_B32_FOR_565(sb, d);
-
- uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
- uint32_t dst_expanded = SkExpand_rgb_16(*dst);
- dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
- // now src and dst expanded are in g:11 r:10 x:1 b:10
- *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- }
- dst += 1;
- DITHER_INC_X(x);
- } while (--count != 0);
- }
-}
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 652ff6ee09..cb93da6121 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -18,24 +18,9 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
-void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x,
- int y);
-
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
SkColor color, int width, SkPMColor);
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
SkColor color, int width, SkPMColor opaqueDst);
-void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/);
-void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int /*x*/, int /*y*/);
-void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y);
-void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y);
#endif
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index d4b1d0dd88..543640a57f 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -10,26 +10,10 @@
#include "SkBlitRow_opts_arm_neon.h"
-extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = {
- nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
-};
-
-extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = {
- nullptr, nullptr,
-};
-
extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
nullptr, nullptr, nullptr, nullptr,
};
-SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
- return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags];
-}
-
-SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
- return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_colorprocs_arm)[flags];
-}
-
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
}
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index b17cd19a33..7a9534ca84 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -17,860 +17,6 @@
#include "SkColor_opts_neon.h"
#include <arm_neon.h>
-#ifdef SK_CPU_ARM64
-static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
- uint8x8x4_t vsrc;
- uint8x8_t vsrc_0, vsrc_1, vsrc_2;
-
- asm (
- "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
- "mov %[vsrc0].8b, v0.8b \t\n"
- "mov %[vsrc1].8b, v1.8b \t\n"
- "mov %[vsrc2].8b, v2.8b \t\n"
- : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
- [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
- : : "v0", "v1", "v2", "v3"
- );
-
- vsrc.val[0] = vsrc_0;
- vsrc.val[1] = vsrc_1;
- vsrc.val[2] = vsrc_2;
-
- return vsrc;
-}
-
-static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
- uint8x8x4_t vsrc;
- uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
-
- asm (
- "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
- "mov %[vsrc0].8b, v0.8b \t\n"
- "mov %[vsrc1].8b, v1.8b \t\n"
- "mov %[vsrc2].8b, v2.8b \t\n"
- "mov %[vsrc3].8b, v3.8b \t\n"
- : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
- [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
- [src] "+&r" (src)
- : : "v0", "v1", "v2", "v3"
- );
-
- vsrc.val[0] = vsrc_0;
- vsrc.val[1] = vsrc_1;
- vsrc.val[2] = vsrc_2;
- vsrc.val[3] = vsrc_3;
-
- return vsrc;
-}
-#endif
-
-void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 == alpha);
-
- while (count >= 8) {
- uint8x8x4_t vsrc;
- uint16x8_t vdst;
-
- // Load
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_3(src);
-#else
- vsrc = vld4_u8((uint8_t*)src);
- src += 8;
-#endif
-
- // Convert src to 565
- vdst = SkPixel32ToPixel16_neon8(vsrc);
-
- // Store
- vst1q_u16(dst, vdst);
-
- // Prepare next iteration
- dst += 8;
- count -= 8;
- };
-
- // Leftovers
- while (count > 0) {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- *dst = SkPixel32ToPixel16_ToU16(c);
- dst++;
- count--;
- };
-}
-
-void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 > alpha);
-
- uint16x8_t vmask_blue, vscale;
-
- // prepare constants
- vscale = vdupq_n_u16(SkAlpha255To256(alpha));
- vmask_blue = vmovq_n_u16(0x1F);
-
- while (count >= 8) {
- uint8x8x4_t vsrc;
- uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
- uint16x8_t vres_r, vres_g, vres_b;
-
- // Load src
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_3(src);
-#else
- {
- register uint8x8_t d0 asm("d0");
- register uint8x8_t d1 asm("d1");
- register uint8x8_t d2 asm("d2");
- register uint8x8_t d3 asm("d3");
-
- asm (
- "vld4.8 {d0-d3},[%[src]]!"
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
- :
- );
- vsrc.val[0] = d0;
- vsrc.val[1] = d1;
- vsrc.val[2] = d2;
- }
-#endif
-
- // Load and unpack dst
- vdst = vld1q_u16(dst);
- vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
- vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
- vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
- vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
-
- // Shift src to 565 range
- vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
- vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
- vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
-
- // Scale src - dst
- vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
- vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
- vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
-
- vres_r = vshrq_n_u16(vres_r * vscale, 8);
- vres_g = vshrq_n_u16(vres_g * vscale, 8);
- vres_b = vshrq_n_u16(vres_b * vscale, 8);
-
- vres_r += vdst_r;
- vres_g += vdst_g;
- vres_b += vdst_b;
-
- // Combine
- vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
- vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
-
- // Store
- vst1q_u16(dst, vres_b);
- dst += 8;
- count -= 8;
- }
- if (count > 0) {
- int scale = SkAlpha255To256(alpha);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- uint16_t d = *dst;
- *dst++ = SkPackRGB16(
- SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
- SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
- SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
- } while (--count != 0);
- }
-}
-
-#ifdef SK_CPU_ARM32
-void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 == alpha);
-
- if (count >= 8) {
- int32_t tmp = 0;
-
- asm volatile (
- "ands %[tmp], %[count], #7 \n\t"
- "vmov.u8 d31, #1<<7 \n\t"
- "vld1.16 {q12}, [%[dst]] \n\t"
- "vld4.8 {d0-d3}, [%[src]] \n\t"
- // Thumb does not support the standard ARM conditional
- // instructions but instead requires the 'it' instruction
- // to signal conditional execution
- "it eq \n\t"
- "moveq %[tmp], #8 \n\t"
- "mov ip, %[dst] \n\t"
-
- "add %[src], %[src], %[tmp], LSL#2 \n\t"
- "add %[dst], %[dst], %[tmp], LSL#1 \n\t"
- "subs %[count], %[count], %[tmp] \n\t"
- "b 9f \n\t"
- // LOOP
- "2: \n\t"
-
- "vld1.16 {q12}, [%[dst]]! \n\t"
- "vld4.8 {d0-d3}, [%[src]]! \n\t"
- "vst1.16 {q10}, [ip] \n\t"
- "sub ip, %[dst], #8*2 \n\t"
- "subs %[count], %[count], #8 \n\t"
- "9: \n\t"
- "pld [%[dst],#32] \n\t"
- // expand 0565 q12 to 8888 {d4-d7}
- "vmovn.u16 d4, q12 \n\t"
- "vshr.u16 q11, q12, #5 \n\t"
- "vshr.u16 q10, q12, #6+5 \n\t"
- "vmovn.u16 d5, q11 \n\t"
- "vmovn.u16 d6, q10 \n\t"
- "vshl.u8 d4, d4, #3 \n\t"
- "vshl.u8 d5, d5, #2 \n\t"
- "vshl.u8 d6, d6, #3 \n\t"
-
- "vmovl.u8 q14, d31 \n\t"
- "vmovl.u8 q13, d31 \n\t"
- "vmovl.u8 q12, d31 \n\t"
-
- // duplicate in 4/2/1 & 8pix vsns
- "vmvn.8 d30, d3 \n\t"
- "vmlal.u8 q14, d30, d6 \n\t"
- "vmlal.u8 q13, d30, d5 \n\t"
- "vmlal.u8 q12, d30, d4 \n\t"
- "vshr.u16 q8, q14, #5 \n\t"
- "vshr.u16 q9, q13, #6 \n\t"
- "vaddhn.u16 d6, q14, q8 \n\t"
- "vshr.u16 q8, q12, #5 \n\t"
- "vaddhn.u16 d5, q13, q9 \n\t"
- "vaddhn.u16 d4, q12, q8 \n\t"
- // intentionally don't calculate alpha
- // result in d4-d6
-
- #ifdef SK_PMCOLOR_IS_RGBA
- "vqadd.u8 d6, d6, d0 \n\t"
- "vqadd.u8 d5, d5, d1 \n\t"
- "vqadd.u8 d4, d4, d2 \n\t"
- #else
- "vqadd.u8 d6, d6, d2 \n\t"
- "vqadd.u8 d5, d5, d1 \n\t"
- "vqadd.u8 d4, d4, d0 \n\t"
- #endif
-
- // pack 8888 {d4-d6} to 0565 q10
- "vshll.u8 q10, d6, #8 \n\t"
- "vshll.u8 q3, d5, #8 \n\t"
- "vshll.u8 q2, d4, #8 \n\t"
- "vsri.u16 q10, q3, #5 \n\t"
- "vsri.u16 q10, q2, #11 \n\t"
-
- "bne 2b \n\t"
-
- "1: \n\t"
- "vst1.16 {q10}, [ip] \n\t"
- : [count] "+r" (count)
- : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
- : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
- "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
- "d30","d31"
- );
- }
- else
- { // handle count < 8
- uint16_t* SK_RESTRICT keep_dst = 0;
-
- asm volatile (
- "vmov.u8 d31, #1<<7 \n\t"
- "mov %[keep_dst], %[dst] \n\t"
-
- "tst %[count], #4 \n\t"
- "beq 14f \n\t"
- "vld1.16 {d25}, [%[dst]]! \n\t"
- "vld1.32 {q1}, [%[src]]! \n\t"
-
- "14: \n\t"
- "tst %[count], #2 \n\t"
- "beq 12f \n\t"
- "vld1.32 {d24[1]}, [%[dst]]! \n\t"
- "vld1.32 {d1}, [%[src]]! \n\t"
-
- "12: \n\t"
- "tst %[count], #1 \n\t"
- "beq 11f \n\t"
- "vld1.16 {d24[1]}, [%[dst]]! \n\t"
- "vld1.32 {d0[1]}, [%[src]]! \n\t"
-
- "11: \n\t"
- // unzips achieve the same as a vld4 operation
- "vuzp.u16 q0, q1 \n\t"
- "vuzp.u8 d0, d1 \n\t"
- "vuzp.u8 d2, d3 \n\t"
- // expand 0565 q12 to 8888 {d4-d7}
- "vmovn.u16 d4, q12 \n\t"
- "vshr.u16 q11, q12, #5 \n\t"
- "vshr.u16 q10, q12, #6+5 \n\t"
- "vmovn.u16 d5, q11 \n\t"
- "vmovn.u16 d6, q10 \n\t"
- "vshl.u8 d4, d4, #3 \n\t"
- "vshl.u8 d5, d5, #2 \n\t"
- "vshl.u8 d6, d6, #3 \n\t"
-
- "vmovl.u8 q14, d31 \n\t"
- "vmovl.u8 q13, d31 \n\t"
- "vmovl.u8 q12, d31 \n\t"
-
- // duplicate in 4/2/1 & 8pix vsns
- "vmvn.8 d30, d3 \n\t"
- "vmlal.u8 q14, d30, d6 \n\t"
- "vmlal.u8 q13, d30, d5 \n\t"
- "vmlal.u8 q12, d30, d4 \n\t"
- "vshr.u16 q8, q14, #5 \n\t"
- "vshr.u16 q9, q13, #6 \n\t"
- "vaddhn.u16 d6, q14, q8 \n\t"
- "vshr.u16 q8, q12, #5 \n\t"
- "vaddhn.u16 d5, q13, q9 \n\t"
- "vaddhn.u16 d4, q12, q8 \n\t"
- // intentionally don't calculate alpha
- // result in d4-d6
-
- #ifdef SK_PMCOLOR_IS_RGBA
- "vqadd.u8 d6, d6, d0 \n\t"
- "vqadd.u8 d5, d5, d1 \n\t"
- "vqadd.u8 d4, d4, d2 \n\t"
- #else
- "vqadd.u8 d6, d6, d2 \n\t"
- "vqadd.u8 d5, d5, d1 \n\t"
- "vqadd.u8 d4, d4, d0 \n\t"
- #endif
-
- // pack 8888 {d4-d6} to 0565 q10
- "vshll.u8 q10, d6, #8 \n\t"
- "vshll.u8 q3, d5, #8 \n\t"
- "vshll.u8 q2, d4, #8 \n\t"
- "vsri.u16 q10, q3, #5 \n\t"
- "vsri.u16 q10, q2, #11 \n\t"
-
- // store
- "tst %[count], #4 \n\t"
- "beq 24f \n\t"
- "vst1.16 {d21}, [%[keep_dst]]! \n\t"
-
- "24: \n\t"
- "tst %[count], #2 \n\t"
- "beq 22f \n\t"
- "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
-
- "22: \n\t"
- "tst %[count], #1 \n\t"
- "beq 21f \n\t"
- "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
-
- "21: \n\t"
- : [count] "+r" (count)
- : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
- : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
- "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
- "d30","d31"
- );
- }
-}
-
-#else // #ifdef SK_CPU_ARM32
-
-void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 == alpha);
-
- if (count >= 16) {
- asm (
- "movi v4.8h, #0x80 \t\n"
-
- "1: \t\n"
- "sub %w[count], %w[count], #16 \t\n"
- "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
- "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
- "prfm pldl1keep, [%[src],#512] \t\n"
- "prfm pldl1keep, [%[dst],#256] \t\n"
- "ushr v20.8h, v17.8h, #5 \t\n"
- "ushr v31.8h, v16.8h, #5 \t\n"
- "xtn v6.8b, v31.8h \t\n"
- "xtn2 v6.16b, v20.8h \t\n"
- "ushr v20.8h, v17.8h, #11 \t\n"
- "shl v19.16b, v6.16b, #2 \t\n"
- "ushr v31.8h, v16.8h, #11 \t\n"
- "xtn v22.8b, v31.8h \t\n"
- "xtn2 v22.16b, v20.8h \t\n"
- "shl v18.16b, v22.16b, #3 \t\n"
- "mvn v3.16b, v3.16b \t\n"
- "xtn v16.8b, v16.8h \t\n"
- "mov v7.16b, v4.16b \t\n"
- "xtn2 v16.16b, v17.8h \t\n"
- "umlal v7.8h, v3.8b, v19.8b \t\n"
- "shl v16.16b, v16.16b, #3 \t\n"
- "mov v22.16b, v4.16b \t\n"
- "ushr v24.8h, v7.8h, #6 \t\n"
- "umlal v22.8h, v3.8b, v18.8b \t\n"
- "ushr v20.8h, v22.8h, #5 \t\n"
- "addhn v20.8b, v22.8h, v20.8h \t\n"
- "cmp %w[count], #16 \t\n"
- "mov v6.16b, v4.16b \t\n"
- "mov v5.16b, v4.16b \t\n"
- "umlal v6.8h, v3.8b, v16.8b \t\n"
- "umlal2 v5.8h, v3.16b, v19.16b \t\n"
- "mov v17.16b, v4.16b \t\n"
- "ushr v19.8h, v6.8h, #5 \t\n"
- "umlal2 v17.8h, v3.16b, v18.16b \t\n"
- "addhn v7.8b, v7.8h, v24.8h \t\n"
- "ushr v18.8h, v5.8h, #6 \t\n"
- "ushr v21.8h, v17.8h, #5 \t\n"
- "addhn2 v7.16b, v5.8h, v18.8h \t\n"
- "addhn2 v20.16b, v17.8h, v21.8h \t\n"
- "mov v22.16b, v4.16b \t\n"
- "addhn v6.8b, v6.8h, v19.8h \t\n"
- "umlal2 v22.8h, v3.16b, v16.16b \t\n"
- "ushr v5.8h, v22.8h, #5 \t\n"
- "addhn2 v6.16b, v22.8h, v5.8h \t\n"
- "uqadd v7.16b, v1.16b, v7.16b \t\n"
-#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
- "uqadd v20.16b, v2.16b, v20.16b \t\n"
- "uqadd v6.16b, v0.16b, v6.16b \t\n"
-#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
- "uqadd v20.16b, v0.16b, v20.16b \t\n"
- "uqadd v6.16b, v2.16b, v6.16b \t\n"
-#else
-#error "This function only supports BGRA and RGBA."
-#endif
- "shll v22.8h, v20.8b, #8 \t\n"
- "shll v5.8h, v7.8b, #8 \t\n"
- "sri v22.8h, v5.8h, #5 \t\n"
- "shll v17.8h, v6.8b, #8 \t\n"
- "shll2 v23.8h, v20.16b, #8 \t\n"
- "shll2 v7.8h, v7.16b, #8 \t\n"
- "sri v22.8h, v17.8h, #11 \t\n"
- "sri v23.8h, v7.8h, #5 \t\n"
- "shll2 v6.8h, v6.16b, #8 \t\n"
- "st1 {v22.8h}, [%[dst]], #16 \t\n"
- "sri v23.8h, v6.8h, #11 \t\n"
- "st1 {v23.8h}, [%[dst]], #16 \t\n"
- "b.ge 1b \t\n"
- : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
- :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v31"
- );
- }
- // Leftovers
- if (count > 0) {
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- if (c) {
- *dst = SkSrcOver32To16(c, *dst);
- }
- dst += 1;
- } while (--count != 0);
- }
-}
-#endif // #ifdef SK_CPU_ARM32
-
-static uint32_t pmcolor_to_expand16(SkPMColor c) {
- unsigned r = SkGetPackedR32(c);
- unsigned g = SkGetPackedG32(c);
- unsigned b = SkGetPackedB32(c);
- return (g << 24) | (r << 13) | (b << 2);
-}
-
-void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
- uint32_t src_expand;
- unsigned scale;
- uint16x8_t vmask_blue;
-
- if (count <= 0) return;
- SkASSERT(((size_t)dst & 0x01) == 0);
-
- /*
- * This preamble code is in order to make dst aligned to 8 bytes
- * in the next mutiple bytes read & write access.
- */
- src_expand = pmcolor_to_expand16(src);
- scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
-
-#define DST_ALIGN 8
-
- /*
- * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
- */
- int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
-
- for (int i = 0; i < preamble_size; i+=2, dst++) {
- uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
- *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
- if (--count == 0)
- break;
- }
-
- int count16 = 0;
- count16 = count >> 4;
- vmask_blue = vmovq_n_u16(SK_B16_MASK);
-
- if (count16) {
- uint16x8_t wide_sr;
- uint16x8_t wide_sg;
- uint16x8_t wide_sb;
- uint16x8_t wide_256_sa;
-
- unsigned sr = SkGetPackedR32(src);
- unsigned sg = SkGetPackedG32(src);
- unsigned sb = SkGetPackedB32(src);
- unsigned sa = SkGetPackedA32(src);
-
- // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
- // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
- //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
- wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
-
- // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
- //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
- wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
-
- // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
- //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
- wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
-
- wide_256_sa =
- vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
-
- while (count16-- > 0) {
- uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
- uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
- vdst1 = vld1q_u16(dst);
- dst += 8;
- vdst2 = vld1q_u16(dst);
- dst -= 8; //to store dst again.
-
- vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes
- vdst1_b = vdst1 & vmask_blue; // extract blue
- vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extract red
- vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
-
- vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes
- vdst2_b = vdst2 & vmask_blue; // extract blue
- vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extract red
- vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
-
- vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1
- vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1
- vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1
-
- vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2
- vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2
- vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2
-
- vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
- vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
- vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue
-
- vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
- vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue
-
- vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
- vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
- vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue
-
- vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
- vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue
-
- vst1q_u16(dst, vdst1);
- dst += 8;
- vst1q_u16(dst, vdst2);
- dst += 8;
- }
- }
-
- count &= 0xF;
- if (count > 0) {
- do {
- uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
- *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
- dst += 1;
- } while (--count != 0);
- }
-}
-
-static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
- prod += vdupq_n_u16(128);
- prod += vshrq_n_u16(prod, 8);
- return vshrq_n_u16(prod, 8);
-}
-
-void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src, int count,
- U8CPU alpha, int /*x*/, int /*y*/) {
- SkASSERT(255 > alpha);
-
- /* This code implements a Neon version of S32A_D565_Blend. The results have
- * a few mismatches compared to the original code. These mismatches never
- * exceed 1.
- */
-
- if (count >= 8) {
- uint16x8_t valpha_max, vmask_blue;
- uint8x8_t valpha;
-
- // prepare constants
- valpha_max = vmovq_n_u16(255);
- valpha = vdup_n_u8(alpha);
- vmask_blue = vmovq_n_u16(SK_B16_MASK);
-
- do {
- uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
- uint16x8_t vres_a, vres_r, vres_g, vres_b;
- uint8x8x4_t vsrc;
-
- // load pixels
- vdst = vld1q_u16(dst);
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_4(src);
-#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
- asm (
- "vld4.u8 %h[vsrc], [%[src]]!"
- : [vsrc] "=w" (vsrc), [src] "+&r" (src)
- : :
- );
-#else
- register uint8x8_t d0 asm("d0");
- register uint8x8_t d1 asm("d1");
- register uint8x8_t d2 asm("d2");
- register uint8x8_t d3 asm("d3");
-
- asm volatile (
- "vld4.u8 {d0-d3},[%[src]]!;"
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
- [src] "+&r" (src)
- : :
- );
- vsrc.val[0] = d0;
- vsrc.val[1] = d1;
- vsrc.val[2] = d2;
- vsrc.val[3] = d3;
-#endif
-
-
- // deinterleave dst
- vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
- vdst_b = vdst & vmask_blue; // extract blue
- vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
- vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
-
- // shift src to 565
- vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
- vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
- vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
-
- // calc src * src_scale
- vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
- vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
- vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
- vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
-
- // prepare dst_scale
- vres_a = SkDiv255Round_neon8(vres_a);
- vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
-
- // add dst * dst_scale to previous result
- vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
- vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
- vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
-
-#ifdef S32A_D565_BLEND_EXACT
- // It is possible to get exact results with this but it is slow,
- // even slower than C code in some cases
- vres_r = SkDiv255Round_neon8(vres_r);
- vres_g = SkDiv255Round_neon8(vres_g);
- vres_b = SkDiv255Round_neon8(vres_b);
-#else
- vres_r = vrshrq_n_u16(vres_r, 8);
- vres_g = vrshrq_n_u16(vres_g, 8);
- vres_b = vrshrq_n_u16(vres_b, 8);
-#endif
- // pack result
- vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
- vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
-
- // store
- vst1q_u16(dst, vres_b);
- dst += 8;
- count -= 8;
- } while (count >= 8);
- }
-
- // leftovers
- while (count-- > 0) {
- SkPMColor sc = *src++;
- if (sc) {
- uint16_t dc = *dst;
- unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
- unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
- unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
- unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
- *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
- }
- dst += 1;
- }
-}
-
-/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
- * each dither value is spaced out into byte lanes, and repeated
- * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
- * start of each row.
- */
-static const uint8_t gDitherMatrix_Neon[48] = {
- 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
- 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
- 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
- 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
-
-};
-
-void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
- int count, U8CPU alpha, int x, int y)
-{
-
- SkASSERT(255 > alpha);
-
- // rescale alpha to range 1 - 256
- int scale = SkAlpha255To256(alpha);
-
- if (count >= 8) {
- /* select row and offset for dither array */
- const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
-
- uint8x8_t vdither = vld1_u8(dstart); // load dither values
- uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
-
- int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
- uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
-
- do {
-
- uint8x8x4_t vsrc;
- uint8x8_t vsrc_r, vsrc_g, vsrc_b;
- uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
- uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
- uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
- uint16x8_t vdst;
- uint16x8_t vdst_r, vdst_g, vdst_b;
- int16x8_t vres_r, vres_g, vres_b;
- int8x8_t vres8_r, vres8_g, vres8_b;
-
- // Load source and add dither
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_3(src);
-#else
- {
- register uint8x8_t d0 asm("d0");
- register uint8x8_t d1 asm("d1");
- register uint8x8_t d2 asm("d2");
- register uint8x8_t d3 asm("d3");
-
- asm (
- "vld4.8 {d0-d3},[%[src]]! "
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
- :
- );
- vsrc.val[0] = d0;
- vsrc.val[1] = d1;
- vsrc.val[2] = d2;
- }
-#endif
- vsrc_r = vsrc.val[NEON_R];
- vsrc_g = vsrc.val[NEON_G];
- vsrc_b = vsrc.val[NEON_B];
-
- vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
- vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
- vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
-
- vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
- vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
- vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
-
- vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
- vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
- vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
-
- vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
- vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
- vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
-
- // Load dst and unpack
- vdst = vld1q_u16(dst);
- vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
- vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
- vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
-
- // subtract dst from src and widen
- vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
- vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
- vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
-
- // multiply diffs by scale and shift
- vres_r = vmulq_s16(vres_r, vscale);
- vres_g = vmulq_s16(vres_g, vscale);
- vres_b = vmulq_s16(vres_b, vscale);
-
- vres8_r = vshrn_n_s16(vres_r, 8);
- vres8_g = vshrn_n_s16(vres_g, 8);
- vres8_b = vshrn_n_s16(vres_b, 8);
-
- // add dst to result
- vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
- vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
- vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
-
- // put result into 565 format
- vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
- vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
-
- // Store result
- vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
-
- // Next iteration
- dst += 8;
- count -= 8;
-
- } while (count >= 8);
- }
-
- // Leftovers
- if (count > 0) {
- int scale = SkAlpha255To256(alpha);
- DITHER_565_SCAN(y);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
-
- int dither = DITHER_VALUE(x);
- int sr = SkGetPackedR32(c);
- int sg = SkGetPackedG32(c);
- int sb = SkGetPackedB32(c);
- sr = SkDITHER_R32To565(sr, dither);
- sg = SkDITHER_G32To565(sg, dither);
- sb = SkDITHER_B32To565(sb, dither);
-
- uint16_t d = *dst;
- *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
- SkAlphaBlend(sg, SkGetPackedG16(d), scale),
- SkAlphaBlend(sb, SkGetPackedB16(d), scale));
- DITHER_INC_X(x);
- } while (--count != 0);
- }
-}
-
/* Neon version of S32_Blend_BlitRow32()
* portable version is in src/core/SkBlitRow_D32.cpp
*/
@@ -1042,253 +188,8 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
#endif // #ifdef SK_CPU_ARM32
-void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y) {
- SkASSERT(255 == alpha);
-
-#define UNROLL 8
-
- if (count >= UNROLL) {
-
- uint8x8_t dbase;
- const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
- dbase = vld1_u8(dstart);
-
- do {
- uint8x8x4_t vsrc;
- uint8x8_t sr, sg, sb, sa, d;
- uint16x8_t dst8, scale8, alpha8;
- uint16x8_t dst_r, dst_g, dst_b;
-
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_4(src);
-#else
- {
- register uint8x8_t d0 asm("d0");
- register uint8x8_t d1 asm("d1");
- register uint8x8_t d2 asm("d2");
- register uint8x8_t d3 asm("d3");
-
- asm ("vld4.8 {d0-d3},[%[src]]! "
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
- :
- );
- vsrc.val[0] = d0;
- vsrc.val[1] = d1;
- vsrc.val[2] = d2;
- vsrc.val[3] = d3;
- }
-#endif
- sa = vsrc.val[NEON_A];
- sr = vsrc.val[NEON_R];
- sg = vsrc.val[NEON_G];
- sb = vsrc.val[NEON_B];
-
- /* calculate 'd', which will be 0..7
- * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
- */
- alpha8 = vmovl_u8(dbase);
- alpha8 = vmlal_u8(alpha8, sa, dbase);
- d = vshrn_n_u16(alpha8, 8); // narrowing too
-
- // sr = sr - (sr>>5) + d
- /* watching for 8-bit overflow. d is 0..7; risky range of
- * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
- * safe as long as we do ((sr-sr>>5) + d)
- */
- sr = vsub_u8(sr, vshr_n_u8(sr, 5));
- sr = vadd_u8(sr, d);
-
- // sb = sb - (sb>>5) + d
- sb = vsub_u8(sb, vshr_n_u8(sb, 5));
- sb = vadd_u8(sb, d);
-
- // sg = sg - (sg>>6) + d>>1; similar logic for overflows
- sg = vsub_u8(sg, vshr_n_u8(sg, 6));
- sg = vadd_u8(sg, vshr_n_u8(d,1));
-
- // need to pick up 8 dst's -- at 16 bits each, 128 bits
- dst8 = vld1q_u16(dst);
- dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
- dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
- dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
-
- // blend
- scale8 = vsubw_u8(vdupq_n_u16(256), sa);
-
- // combine the addq and mul, save 3 insns
- scale8 = vshrq_n_u16(scale8, 3);
- dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
- dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
- dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
-
- // repack to store
- dst8 = vshrq_n_u16(dst_b, 5);
- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
-
- vst1q_u16(dst, dst8);
-
- dst += UNROLL;
- count -= UNROLL;
- // skip x += UNROLL, since it's unchanged mod-4
- } while (count >= UNROLL);
- }
-#undef UNROLL
-
- // residuals
- if (count > 0) {
- DITHER_565_SCAN(y);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- if (c) {
- unsigned a = SkGetPackedA32(c);
-
- // dither and alpha are just temporary variables to work-around
- // an ICE in debug.
- unsigned dither = DITHER_VALUE(x);
- unsigned alpha = SkAlpha255To256(a);
- int d = SkAlphaMul(dither, alpha);
-
- unsigned sr = SkGetPackedR32(c);
- unsigned sg = SkGetPackedG32(c);
- unsigned sb = SkGetPackedB32(c);
- sr = SkDITHER_R32_FOR_565(sr, d);
- sg = SkDITHER_G32_FOR_565(sg, d);
- sb = SkDITHER_B32_FOR_565(sb, d);
-
- uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
- uint32_t dst_expanded = SkExpand_rgb_16(*dst);
- dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
- // now src and dst expanded are in g:11 r:10 x:1 b:10
- *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- }
- dst += 1;
- DITHER_INC_X(x);
- } while (--count != 0);
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha, int x, int y) {
- SkASSERT(255 == alpha);
-
-#define UNROLL 8
- if (count >= UNROLL) {
- uint8x8_t d;
- const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
- d = vld1_u8(dstart);
-
- while (count >= UNROLL) {
- uint8x8_t sr, sg, sb;
- uint16x8_t dr, dg, db;
- uint16x8_t dst8;
- uint8x8x4_t vsrc;
-
-#ifdef SK_CPU_ARM64
- vsrc = sk_vld4_u8_arm64_3(src);
-#else
- {
- register uint8x8_t d0 asm("d0");
- register uint8x8_t d1 asm("d1");
- register uint8x8_t d2 asm("d2");
- register uint8x8_t d3 asm("d3");
-
- asm (
- "vld4.8 {d0-d3},[%[src]]! "
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
- :
- );
- vsrc.val[0] = d0;
- vsrc.val[1] = d1;
- vsrc.val[2] = d2;
- }
-#endif
- sr = vsrc.val[NEON_R];
- sg = vsrc.val[NEON_G];
- sb = vsrc.val[NEON_B];
-
- /* XXX: if we want to prefetch, hide it in the above asm()
- * using the gcc __builtin_prefetch(), the prefetch will
- * fall to the bottom of the loop -- it won't stick up
- * at the top of the loop, just after the vld4.
- */
-
- // sr = sr - (sr>>5) + d
- sr = vsub_u8(sr, vshr_n_u8(sr, 5));
- dr = vaddl_u8(sr, d);
-
- // sb = sb - (sb>>5) + d
- sb = vsub_u8(sb, vshr_n_u8(sb, 5));
- db = vaddl_u8(sb, d);
-
- // sg = sg - (sg>>6) + d>>1; similar logic for overflows
- sg = vsub_u8(sg, vshr_n_u8(sg, 6));
- dg = vaddl_u8(sg, vshr_n_u8(d, 1));
-
- // pack high bits of each into 565 format (rgb, b is lsb)
- dst8 = vshrq_n_u16(db, 3);
- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
-
- // store it
- vst1q_u16(dst, dst8);
-
- dst += UNROLL;
- // we don't need to increment src as the asm above has already done it
- count -= UNROLL;
- x += UNROLL; // probably superfluous
- }
- }
-#undef UNROLL
-
- // residuals
- if (count > 0) {
- DITHER_565_SCAN(y);
- do {
- SkPMColor c = *src++;
- SkPMColorAssert(c);
- SkASSERT(SkGetPackedA32(c) == 255);
-
- unsigned dither = DITHER_VALUE(x);
- *dst++ = SkDitherRGB32To565(c, dither);
- DITHER_INC_X(x);
- } while (--count != 0);
- }
-}
-
///////////////////////////////////////////////////////////////////////////////
-const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
- // no dither
- S32_D565_Opaque_neon,
- S32_D565_Blend_neon,
- S32A_D565_Opaque_neon,
-#if 0
- S32A_D565_Blend_neon,
-#else
- nullptr, // https://code.google.com/p/skia/issues/detail?id=2797
-#endif
-
- // dither
- S32_D565_Opaque_Dither_neon,
- S32_D565_Blend_Dither_neon,
- S32A_D565_Opaque_Dither_neon,
- nullptr, // S32A_D565_Blend_Dither
-};
-
-const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
- Color32A_D565_neon, // Color32_D565,
- Color32A_D565_neon, // Color32A_D565,
- Color32A_D565_neon, // Color32_D565_Dither,
- Color32A_D565_neon, // Color32A_D565_Dither
-};
-
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_neon, // S32_Blend,
diff --git a/src/opts/SkBlitRow_opts_arm_neon.h b/src/opts/SkBlitRow_opts_arm_neon.h
index 159a466c98..815c2b7476 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.h
+++ b/src/opts/SkBlitRow_opts_arm_neon.h
@@ -9,8 +9,6 @@
#include "SkBlitRow.h"
-extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[];
-extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[];
extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[];
#endif
diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp
index a9abe06580..289bb7e88c 100644
--- a/src/opts/SkBlitRow_opts_none.cpp
+++ b/src/opts/SkBlitRow_opts_none.cpp
@@ -9,14 +9,6 @@
// Platform impl of Platform_procs with no overrides
-SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
- return nullptr;
-}
-
-SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
- return nullptr;
-}
-
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
return nullptr;
}
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 4b8c8a118b..7917259554 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -78,43 +78,6 @@ void SkBitmapProcState::platformProcs() {
////////////////////////////////////////////////////////////////////////////////
-static const SkBlitRow::Proc16 platform_16_procs[] = {
- S32_D565_Opaque_SSE2, // S32_D565_Opaque
- nullptr, // S32_D565_Blend
- S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
- nullptr, // S32A_D565_Blend
- S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither
- nullptr, // S32_D565_Blend_Dither
- S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither
- nullptr, // S32A_D565_Blend_Dither
-};
-
-SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
- if (SkCpu::Supports(SkCpu::SSE2)) {
- return platform_16_procs[flags];
- } else {
- return nullptr;
- }
-}
-
-static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = {
- Color32A_D565_SSE2, // Color32A_D565,
- nullptr, // Color32A_D565_Dither
-};
-
-SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
-/* If you're thinking about writing an SSE4 version of this, do check it's
- * actually faster on Atom. Our original SSE4 version was slower than this
- * SSE2 version on Silvermont, and only marginally faster on a Core i7,
- * mainly due to the MULLD timings.
- */
- if (SkCpu::Supports(SkCpu::SSE2)) {
- return platform_565_colorprocs_SSE2[flags];
- } else {
- return nullptr;
- }
-}
-
static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,