diff options
-rw-r--r-- | src/core/SkBlitRow.h | 3 | ||||
-rw-r--r-- | src/core/SkBlitRow_D16.cpp | 12 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 547 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.h | 15 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 16 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.cpp | 1099 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm_neon.h | 2 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_none.cpp | 8 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 37 |
9 files changed, 2 insertions, 1737 deletions
diff --git a/src/core/SkBlitRow.h b/src/core/SkBlitRow.h index 56121eba78..35f0a0a48d 100644 --- a/src/core/SkBlitRow.h +++ b/src/core/SkBlitRow.h @@ -78,9 +78,6 @@ public: static Proc32 PlatformProcs32(unsigned flags); - static Proc16 PlatformFactory565(unsigned flags); - static ColorProc16 PlatformColorFactory565(unsigned flags); - private: enum { kFlags16_Mask = 7, diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp index 648e0ea8bb..3c5aba57ca 100644 --- a/src/core/SkBlitRow_D16.cpp +++ b/src/core/SkBlitRow_D16.cpp @@ -245,11 +245,7 @@ SkBlitRow::Proc16 SkBlitRow::Factory16(unsigned flags) { // just so we don't crash flags &= kFlags16_Mask; - SkBlitRow::Proc16 proc = PlatformFactory565(flags); - if (nullptr == proc) { - proc = gDefault_565_Procs[flags]; - } - return proc; + return gDefault_565_Procs[flags]; } static const SkBlitRow::ColorProc16 gDefault_565_ColorProcs[] = { @@ -273,9 +269,5 @@ SkBlitRow::ColorProc16 SkBlitRow::ColorFactory16(unsigned flags) { SkASSERT(flags < SK_ARRAY_COUNT(gDefault_565_ColorProcs)); - SkBlitRow::ColorProc16 proc = PlatformColorFactory565(flags); - if (nullptr == proc) { - proc = gDefault_565_ColorProcs[flags]; - } - return proc; + return gDefault_565_ColorProcs[flags]; } diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index 7ce1fc9a80..7f03907d1c 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -103,75 +103,6 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, } } -void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) { - SkASSERT(count > 0); - - uint32_t src_expand = (SkGetPackedG32(src) << 24) | - (SkGetPackedR32(src) << 13) | - (SkGetPackedB32(src) << 2); - unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; - - // Check if we have enough pixels to run SIMD - if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { - __m128i* dst_wide; - const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2); - const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3); - const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2); - const __m128i scale_wide = _mm_set1_epi16(scale); - const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK); - const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT); - - // Align dst to an even 16 byte address (0-7 pixels) - while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { - *dst = SkBlend32_RGB16(src_expand, *dst, scale); - dst += 1; - count--; - } - - dst_wide = reinterpret_cast<__m128i*>(dst); - do { - // Load eight RGB565 pixels - __m128i pixels = _mm_load_si128(dst_wide); - - // Mask out sub-pixels - __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT); - __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS); - pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS); - __m128i pixel_B = _mm_and_si128(pixels, mask_blue); - - // Scale with alpha - pixel_R = _mm_mullo_epi16(pixel_R, scale_wide); - pixel_G = _mm_mullo_epi16(pixel_G, scale_wide); - pixel_B = _mm_mullo_epi16(pixel_B, scale_wide); - - // Add src_X_wide and shift down again - pixel_R = _mm_add_epi16(pixel_R, src_R_wide); - pixel_R = _mm_srli_epi16(pixel_R, 5); - pixel_G = _mm_add_epi16(pixel_G, src_G_wide); - pixel_B = _mm_add_epi16(pixel_B, src_B_wide); - pixel_B = _mm_srli_epi16(pixel_B, 5); - - // Combine into RGB565 and store - pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT); - pixel_G = _mm_and_si128(pixel_G, mask_green); - pixels = _mm_or_si128(pixel_R, pixel_G); - pixels = _mm_or_si128(pixels, pixel_B); - _mm_store_si128(dst_wide, pixels); - count -= 8; - dst_wide++; - } while (count >= 8); - - dst = reinterpret_cast<uint16_t*>(dst_wide); - } - - // Small loop to handle remaining pixels. - while (count > 0) { - *dst = SkBlend32_RGB16(src_expand, *dst, scale); - dst += 1; - count--; - } -} - // The following (left) shifts cause the top 5 bits of the mask components to // line up with the corresponding components in an SkPMColor. // Note that the mask's RGB16 order may differ from the SkPMColor order. @@ -510,481 +441,3 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], width--; } } - -/* SSE2 version of S32_D565_Opaque() - * portable version is in core/SkBlitRow_D16.cpp - */ -void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - if (count <= 0) { - return; - } - - if (count >= 8) { - while (((size_t)dst & 0x0F) != 0) { - SkPMColor c = *src++; - SkPMColorAssert(c); - - *dst++ = SkPixel32ToPixel16_ToU16(c); - count--; - } - - const __m128i* s = reinterpret_cast<const __m128i*>(src); - __m128i* d = reinterpret_cast<__m128i*>(dst); - - while (count >= 8) { - // Load 8 pixels of src. - __m128i src_pixel1 = _mm_loadu_si128(s++); - __m128i src_pixel2 = _mm_loadu_si128(s++); - - __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2); - _mm_store_si128(d++, d_pixel); - count -= 8; - } - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<uint16_t*>(d); - } - - if (count > 0) { - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - *dst++ = SkPixel32ToPixel16_ToU16(c); - } while (--count != 0); - } -} - -/* SSE2 version of S32A_D565_Opaque() - * portable version is in core/SkBlitRow_D16.cpp - */ -void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - if (count <= 0) { - return; - } - - if (count >= 8) { - // Make dst 16 bytes alignment - while (((size_t)dst & 0x0F) != 0) { - SkPMColor c = *src++; - if (c) { - *dst = SkSrcOver32To16(c, *dst); - } - dst += 1; - count--; - } - - const __m128i* s = reinterpret_cast<const __m128i*>(src); - __m128i* d = reinterpret_cast<__m128i*>(dst); - __m128i var255 = _mm_set1_epi16(255); - __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); - __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); - __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); - - while (count >= 8) { - // Load 8 pixels of src. - __m128i src_pixel1 = _mm_loadu_si128(s++); - __m128i src_pixel2 = _mm_loadu_si128(s++); - - // Check whether src pixels are equal to 0 and get the highest bit - // of each byte of result, if src pixels are all zero, src_cmp1 and - // src_cmp2 will be 0xFFFF. - int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, - _mm_setzero_si128())); - int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, - _mm_setzero_si128())); - if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { - d++; - count -= 8; - continue; - } - - // Load 8 pixels of dst. - __m128i dst_pixel = _mm_load_si128(d); - - // Extract A from src. - __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); - sa1 = _mm_srli_epi32(sa1, 24); - __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); - sa2 = _mm_srli_epi32(sa2, 24); - __m128i sa = _mm_packs_epi32(sa1, sa2); - - // Extract R from src. - __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); - sr1 = _mm_srli_epi32(sr1, 24); - __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); - sr2 = _mm_srli_epi32(sr2, 24); - __m128i sr = _mm_packs_epi32(sr1, sr2); - - // Extract G from src. - __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); - sg1 = _mm_srli_epi32(sg1, 24); - __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); - sg2 = _mm_srli_epi32(sg2, 24); - __m128i sg = _mm_packs_epi32(sg1, sg2); - - // Extract B from src. - __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); - sb1 = _mm_srli_epi32(sb1, 24); - __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); - sb2 = _mm_srli_epi32(sb2, 24); - __m128i sb = _mm_packs_epi32(sb1, sb2); - - // Extract R G B from dst. - __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - dr = _mm_and_si128(dr, r16_mask); - __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dg = _mm_and_si128(dg, g16_mask); - __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); - db = _mm_and_si128(db, b16_mask); - - __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa - - // Calculate R G B of result. - // Original algorithm is in SkSrcOver32To16(). - dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)); - dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); - dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)); - dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); - db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)); - db = _mm_srli_epi16(db, 8 - SK_B16_BITS); - - // Pack R G B into 16-bit color. - __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); - - // Store 8 16-bit colors in dst. - _mm_store_si128(d++, d_pixel); - count -= 8; - } - - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<uint16_t*>(d); - } - - if (count > 0) { - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - *dst = SkSrcOver32To16(c, *dst); - } - dst += 1; - } while (--count != 0); - } -} - -void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y) { - SkASSERT(255 == alpha); - - if (count <= 0) { - return; - } - - if (count >= 8) { - while (((size_t)dst & 0x0F) != 0) { - DITHER_565_SCAN(y); - SkPMColor c = *src++; - SkPMColorAssert(c); - - unsigned dither = DITHER_VALUE(x); - *dst++ = SkDitherRGB32To565(c, dither); - DITHER_INC_X(x); - count--; - } - - unsigned short dither_value[8]; - __m128i dither; -#ifdef ENABLE_DITHER_MATRIX_4X4 - const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; - dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; - dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; - dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; - dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; -#else - const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; - dither_value[0] = dither_value[4] = (dither_scan - >> (((x) & 3) << 2)) & 0xF; - dither_value[1] = dither_value[5] = (dither_scan - >> (((x + 1) & 3) << 2)) & 0xF; - dither_value[2] = dither_value[6] = (dither_scan - >> (((x + 2) & 3) << 2)) & 0xF; - dither_value[3] = dither_value[7] = (dither_scan - >> (((x + 3) & 3) << 2)) & 0xF; -#endif - dither = _mm_loadu_si128((__m128i*) dither_value); - - const __m128i* s = reinterpret_cast<const __m128i*>(src); - __m128i* d = reinterpret_cast<__m128i*>(dst); - - while (count >= 8) { - // Load 8 pixels of src. - __m128i src_pixel1 = _mm_loadu_si128(s++); - __m128i src_pixel2 = _mm_loadu_si128(s++); - - // Extract R from src. - __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); - sr1 = _mm_srli_epi32(sr1, 24); - __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); - sr2 = _mm_srli_epi32(sr2, 24); - __m128i sr = _mm_packs_epi32(sr1, sr2); - - // SkDITHER_R32To565(sr, dither) - __m128i sr_offset = _mm_srli_epi16(sr, 5); - sr = _mm_add_epi16(sr, dither); - sr = _mm_sub_epi16(sr, sr_offset); - sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); - - // Extract G from src. - __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); - sg1 = _mm_srli_epi32(sg1, 24); - __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); - sg2 = _mm_srli_epi32(sg2, 24); - __m128i sg = _mm_packs_epi32(sg1, sg2); - - // SkDITHER_R32To565(sg, dither) - __m128i sg_offset = _mm_srli_epi16(sg, 6); - sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); - sg = _mm_sub_epi16(sg, sg_offset); - sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); - - // Extract B from src. - __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); - sb1 = _mm_srli_epi32(sb1, 24); - __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); - sb2 = _mm_srli_epi32(sb2, 24); - __m128i sb = _mm_packs_epi32(sb1, sb2); - - // SkDITHER_R32To565(sb, dither) - __m128i sb_offset = _mm_srli_epi16(sb, 5); - sb = _mm_add_epi16(sb, dither); - sb = _mm_sub_epi16(sb, sb_offset); - sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); - - // Pack and store 16-bit dst pixel. - __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); - _mm_store_si128(d++, d_pixel); - - count -= 8; - x += 8; - } - - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<uint16_t*>(d); - } - - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - - unsigned dither = DITHER_VALUE(x); - *dst++ = SkDitherRGB32To565(c, dither); - DITHER_INC_X(x); - } while (--count != 0); - } -} - -/* SSE2 version of S32A_D565_Opaque_Dither() - * portable version is in core/SkBlitRow_D16.cpp - */ -void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y) { - SkASSERT(255 == alpha); - - if (count <= 0) { - return; - } - - if (count >= 8) { - while (((size_t)dst & 0x0F) != 0) { - DITHER_565_SCAN(y); - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - unsigned a = SkGetPackedA32(c); - - int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); - - unsigned sr = SkGetPackedR32(c); - unsigned sg = SkGetPackedG32(c); - unsigned sb = SkGetPackedB32(c); - sr = SkDITHER_R32_FOR_565(sr, d); - sg = SkDITHER_G32_FOR_565(sg, d); - sb = SkDITHER_B32_FOR_565(sb, d); - - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); - uint32_t dst_expanded = SkExpand_rgb_16(*dst); - dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); - // now src and dst expanded are in g:11 r:10 x:1 b:10 - *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - } - dst += 1; - DITHER_INC_X(x); - count--; - } - - unsigned short dither_value[8]; - __m128i dither, dither_cur; -#ifdef ENABLE_DITHER_MATRIX_4X4 - const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; - dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; - dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; - dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; - dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; -#else - const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; - dither_value[0] = dither_value[4] = (dither_scan - >> (((x) & 3) << 2)) & 0xF; - dither_value[1] = dither_value[5] = (dither_scan - >> (((x + 1) & 3) << 2)) & 0xF; - dither_value[2] = dither_value[6] = (dither_scan - >> (((x + 2) & 3) << 2)) & 0xF; - dither_value[3] = dither_value[7] = (dither_scan - >> (((x + 3) & 3) << 2)) & 0xF; -#endif - dither = _mm_loadu_si128((__m128i*) dither_value); - - const __m128i* s = reinterpret_cast<const __m128i*>(src); - __m128i* d = reinterpret_cast<__m128i*>(dst); - __m128i var256 = _mm_set1_epi16(256); - __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); - __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); - __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); - - while (count >= 8) { - // Load 8 pixels of src and dst. - __m128i src_pixel1 = _mm_loadu_si128(s++); - __m128i src_pixel2 = _mm_loadu_si128(s++); - __m128i dst_pixel = _mm_load_si128(d); - - // Extract A from src. - __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); - sa1 = _mm_srli_epi32(sa1, 24); - __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); - sa2 = _mm_srli_epi32(sa2, 24); - __m128i sa = _mm_packs_epi32(sa1, sa2); - - // Calculate current dither value. - dither_cur = _mm_mullo_epi16(dither, - _mm_add_epi16(sa, _mm_set1_epi16(1))); - dither_cur = _mm_srli_epi16(dither_cur, 8); - - // Extract R from src. - __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); - sr1 = _mm_srli_epi32(sr1, 24); - __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); - sr2 = _mm_srli_epi32(sr2, 24); - __m128i sr = _mm_packs_epi32(sr1, sr2); - - // SkDITHER_R32_FOR_565(sr, d) - __m128i sr_offset = _mm_srli_epi16(sr, 5); - sr = _mm_add_epi16(sr, dither_cur); - sr = _mm_sub_epi16(sr, sr_offset); - - // Expand sr. - sr = _mm_slli_epi16(sr, 2); - - // Extract G from src. - __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); - sg1 = _mm_srli_epi32(sg1, 24); - __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); - sg2 = _mm_srli_epi32(sg2, 24); - __m128i sg = _mm_packs_epi32(sg1, sg2); - - // sg = SkDITHER_G32_FOR_565(sg, d). - __m128i sg_offset = _mm_srli_epi16(sg, 6); - sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); - sg = _mm_sub_epi16(sg, sg_offset); - - // Expand sg. - sg = _mm_slli_epi16(sg, 3); - - // Extract B from src. - __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); - sb1 = _mm_srli_epi32(sb1, 24); - __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); - sb2 = _mm_srli_epi32(sb2, 24); - __m128i sb = _mm_packs_epi32(sb1, sb2); - - // sb = SkDITHER_B32_FOR_565(sb, d). - __m128i sb_offset = _mm_srli_epi16(sb, 5); - sb = _mm_add_epi16(sb, dither_cur); - sb = _mm_sub_epi16(sb, sb_offset); - - // Expand sb. - sb = _mm_slli_epi16(sb, 2); - - // Extract R G B from dst. - __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - dr = _mm_and_si128(dr, r16_mask); - __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dg = _mm_and_si128(dg, g16_mask); - __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); - db = _mm_and_si128(db, b16_mask); - - // SkAlpha255To256(255 - a) >> 3 - __m128i isa = _mm_sub_epi16(var256, sa); - isa = _mm_srli_epi16(isa, 3); - - dr = _mm_mullo_epi16(dr, isa); - dr = _mm_add_epi16(dr, sr); - dr = _mm_srli_epi16(dr, 5); - - dg = _mm_mullo_epi16(dg, isa); - dg = _mm_add_epi16(dg, sg); - dg = _mm_srli_epi16(dg, 5); - - db = _mm_mullo_epi16(db, isa); - db = _mm_add_epi16(db, sb); - db = _mm_srli_epi16(db, 5); - - // Package and store dst pixel. - __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); - _mm_store_si128(d++, d_pixel); - - count -= 8; - x += 8; - } - - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<uint16_t*>(d); - } - - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - unsigned a = SkGetPackedA32(c); - - int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); - - unsigned sr = SkGetPackedR32(c); - unsigned sg = SkGetPackedG32(c); - unsigned sb = SkGetPackedB32(c); - sr = SkDITHER_R32_FOR_565(sr, d); - sg = SkDITHER_G32_FOR_565(sg, d); - sb = SkDITHER_B32_FOR_565(sb, d); - - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); - uint32_t dst_expanded = SkExpand_rgb_16(*dst); - dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); - // now src and dst expanded are in g:11 r:10 x:1 b:10 - *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - } - dst += 1; - DITHER_INC_X(x); - } while (--count != 0); - } -} diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h index 652ff6ee09..cb93da6121 100644 --- a/src/opts/SkBlitRow_opts_SSE2.h +++ b/src/opts/SkBlitRow_opts_SSE2.h @@ -18,24 +18,9 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha); -void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, - int y); - void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor); void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst); -void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/); -void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int /*x*/, int /*y*/); -void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y); -void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y); #endif diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index d4b1d0dd88..543640a57f 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -10,26 +10,10 @@ #include "SkBlitRow_opts_arm_neon.h" -extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = { - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, -}; - -extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = { - nullptr, nullptr, -}; - extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { nullptr, nullptr, nullptr, nullptr, }; -SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { - return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags]; -} - -SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { - return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_colorprocs_arm)[flags]; -} - SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags]; } diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index b17cd19a33..7a9534ca84 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -17,860 +17,6 @@ #include "SkColor_opts_neon.h" #include <arm_neon.h> -#ifdef SK_CPU_ARM64 -static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { - uint8x8x4_t vsrc; - uint8x8_t vsrc_0, vsrc_1, vsrc_2; - - asm ( - "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" - "mov %[vsrc0].8b, v0.8b \t\n" - "mov %[vsrc1].8b, v1.8b \t\n" - "mov %[vsrc2].8b, v2.8b \t\n" - : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), - [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) - : : "v0", "v1", "v2", "v3" - ); - - vsrc.val[0] = vsrc_0; - vsrc.val[1] = vsrc_1; - vsrc.val[2] = vsrc_2; - - return vsrc; -} - -static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { - uint8x8x4_t vsrc; - uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; - - asm ( - "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" - "mov %[vsrc0].8b, v0.8b \t\n" - "mov %[vsrc1].8b, v1.8b \t\n" - "mov %[vsrc2].8b, v2.8b \t\n" - "mov %[vsrc3].8b, v3.8b \t\n" - : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), - [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), - [src] "+&r" (src) - : : "v0", "v1", "v2", "v3" - ); - - vsrc.val[0] = vsrc_0; - vsrc.val[1] = vsrc_1; - vsrc.val[2] = vsrc_2; - vsrc.val[3] = vsrc_3; - - return vsrc; -} -#endif - -void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - while (count >= 8) { - uint8x8x4_t vsrc; - uint16x8_t vdst; - - // Load -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_3(src); -#else - vsrc = vld4_u8((uint8_t*)src); - src += 8; -#endif - - // Convert src to 565 - vdst = SkPixel32ToPixel16_neon8(vsrc); - - // Store - vst1q_u16(dst, vdst); - - // Prepare next iteration - dst += 8; - count -= 8; - }; - - // Leftovers - while (count > 0) { - SkPMColor c = *src++; - SkPMColorAssert(c); - *dst = SkPixel32ToPixel16_ToU16(c); - dst++; - count--; - }; -} - -void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 > alpha); - - uint16x8_t vmask_blue, vscale; - - // prepare constants - vscale = vdupq_n_u16(SkAlpha255To256(alpha)); - vmask_blue = vmovq_n_u16(0x1F); - - while (count >= 8) { - uint8x8x4_t vsrc; - uint16x8_t vdst, vdst_r, vdst_g, vdst_b; - uint16x8_t vres_r, vres_g, vres_b; - - // Load src -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_3(src); -#else - { - register uint8x8_t d0 asm("d0"); - register uint8x8_t d1 asm("d1"); - register uint8x8_t d2 asm("d2"); - register uint8x8_t d3 asm("d3"); - - asm ( - "vld4.8 {d0-d3},[%[src]]!" - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) - : - ); - vsrc.val[0] = d0; - vsrc.val[1] = d1; - vsrc.val[2] = d2; - } -#endif - - // Load and unpack dst - vdst = vld1q_u16(dst); - vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes - vdst_b = vandq_u16(vdst, vmask_blue); // extract blue - vdst_r = vshrq_n_u16(vdst, 6+5); // extract red - vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green - - // Shift src to 565 range - vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); - vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); - vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); - - // Scale src - dst - vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; - vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; - vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; - - vres_r = vshrq_n_u16(vres_r * vscale, 8); - vres_g = vshrq_n_u16(vres_g * vscale, 8); - vres_b = vshrq_n_u16(vres_b * vscale, 8); - - vres_r += vdst_r; - vres_g += vdst_g; - vres_b += vdst_b; - - // Combine - vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue - vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue - - // Store - vst1q_u16(dst, vres_b); - dst += 8; - count -= 8; - } - if (count > 0) { - int scale = SkAlpha255To256(alpha); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - uint16_t d = *dst; - *dst++ = SkPackRGB16( - SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), - SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), - SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); - } while (--count != 0); - } -} - -#ifdef SK_CPU_ARM32 -void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - if (count >= 8) { - int32_t tmp = 0; - - asm volatile ( - "ands %[tmp], %[count], #7 \n\t" - "vmov.u8 d31, #1<<7 \n\t" - "vld1.16 {q12}, [%[dst]] \n\t" - "vld4.8 {d0-d3}, [%[src]] \n\t" - // Thumb does not support the standard ARM conditional - // instructions but instead requires the 'it' instruction - // to signal conditional execution - "it eq \n\t" - "moveq %[tmp], #8 \n\t" - "mov ip, %[dst] \n\t" - - "add %[src], %[src], %[tmp], LSL#2 \n\t" - "add %[dst], %[dst], %[tmp], LSL#1 \n\t" - "subs %[count], %[count], %[tmp] \n\t" - "b 9f \n\t" - // LOOP - "2: \n\t" - - "vld1.16 {q12}, [%[dst]]! \n\t" - "vld4.8 {d0-d3}, [%[src]]! \n\t" - "vst1.16 {q10}, [ip] \n\t" - "sub ip, %[dst], #8*2 \n\t" - "subs %[count], %[count], #8 \n\t" - "9: \n\t" - "pld [%[dst],#32] \n\t" - // expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12 \n\t" - "vshr.u16 q11, q12, #5 \n\t" - "vshr.u16 q10, q12, #6+5 \n\t" - "vmovn.u16 d5, q11 \n\t" - "vmovn.u16 d6, q10 \n\t" - "vshl.u8 d4, d4, #3 \n\t" - "vshl.u8 d5, d5, #2 \n\t" - "vshl.u8 d6, d6, #3 \n\t" - - "vmovl.u8 q14, d31 \n\t" - "vmovl.u8 q13, d31 \n\t" - "vmovl.u8 q12, d31 \n\t" - - // duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d3 \n\t" - "vmlal.u8 q14, d30, d6 \n\t" - "vmlal.u8 q13, d30, d5 \n\t" - "vmlal.u8 q12, d30, d4 \n\t" - "vshr.u16 q8, q14, #5 \n\t" - "vshr.u16 q9, q13, #6 \n\t" - "vaddhn.u16 d6, q14, q8 \n\t" - "vshr.u16 q8, q12, #5 \n\t" - "vaddhn.u16 d5, q13, q9 \n\t" - "vaddhn.u16 d4, q12, q8 \n\t" - // intentionally don't calculate alpha - // result in d4-d6 - - #ifdef SK_PMCOLOR_IS_RGBA - "vqadd.u8 d6, d6, d0 \n\t" - "vqadd.u8 d5, d5, d1 \n\t" - "vqadd.u8 d4, d4, d2 \n\t" - #else - "vqadd.u8 d6, d6, d2 \n\t" - "vqadd.u8 d5, d5, d1 \n\t" - "vqadd.u8 d4, d4, d0 \n\t" - #endif - - // pack 8888 {d4-d6} to 0565 q10 - "vshll.u8 q10, d6, #8 \n\t" - "vshll.u8 q3, d5, #8 \n\t" - "vshll.u8 q2, d4, #8 \n\t" - "vsri.u16 q10, q3, #5 \n\t" - "vsri.u16 q10, q2, #11 \n\t" - - "bne 2b \n\t" - - "1: \n\t" - "vst1.16 {q10}, [ip] \n\t" - : [count] "+r" (count) - : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); - } - else - { // handle count < 8 - uint16_t* SK_RESTRICT keep_dst = 0; - - asm volatile ( - "vmov.u8 d31, #1<<7 \n\t" - "mov %[keep_dst], %[dst] \n\t" - - "tst %[count], #4 \n\t" - "beq 14f \n\t" - "vld1.16 {d25}, [%[dst]]! \n\t" - "vld1.32 {q1}, [%[src]]! \n\t" - - "14: \n\t" - "tst %[count], #2 \n\t" - "beq 12f \n\t" - "vld1.32 {d24[1]}, [%[dst]]! \n\t" - "vld1.32 {d1}, [%[src]]! \n\t" - - "12: \n\t" - "tst %[count], #1 \n\t" - "beq 11f \n\t" - "vld1.16 {d24[1]}, [%[dst]]! \n\t" - "vld1.32 {d0[1]}, [%[src]]! \n\t" - - "11: \n\t" - // unzips achieve the same as a vld4 operation - "vuzp.u16 q0, q1 \n\t" - "vuzp.u8 d0, d1 \n\t" - "vuzp.u8 d2, d3 \n\t" - // expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12 \n\t" - "vshr.u16 q11, q12, #5 \n\t" - "vshr.u16 q10, q12, #6+5 \n\t" - "vmovn.u16 d5, q11 \n\t" - "vmovn.u16 d6, q10 \n\t" - "vshl.u8 d4, d4, #3 \n\t" - "vshl.u8 d5, d5, #2 \n\t" - "vshl.u8 d6, d6, #3 \n\t" - - "vmovl.u8 q14, d31 \n\t" - "vmovl.u8 q13, d31 \n\t" - "vmovl.u8 q12, d31 \n\t" - - // duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d3 \n\t" - "vmlal.u8 q14, d30, d6 \n\t" - "vmlal.u8 q13, d30, d5 \n\t" - "vmlal.u8 q12, d30, d4 \n\t" - "vshr.u16 q8, q14, #5 \n\t" - "vshr.u16 q9, q13, #6 \n\t" - "vaddhn.u16 d6, q14, q8 \n\t" - "vshr.u16 q8, q12, #5 \n\t" - "vaddhn.u16 d5, q13, q9 \n\t" - "vaddhn.u16 d4, q12, q8 \n\t" - // intentionally don't calculate alpha - // result in d4-d6 - - #ifdef SK_PMCOLOR_IS_RGBA - "vqadd.u8 d6, d6, d0 \n\t" - "vqadd.u8 d5, d5, d1 \n\t" - "vqadd.u8 d4, d4, d2 \n\t" - #else - "vqadd.u8 d6, d6, d2 \n\t" - "vqadd.u8 d5, d5, d1 \n\t" - "vqadd.u8 d4, d4, d0 \n\t" - #endif - - // pack 8888 {d4-d6} to 0565 q10 - "vshll.u8 q10, d6, #8 \n\t" - "vshll.u8 q3, d5, #8 \n\t" - "vshll.u8 q2, d4, #8 \n\t" - "vsri.u16 q10, q3, #5 \n\t" - "vsri.u16 q10, q2, #11 \n\t" - - // store - "tst %[count], #4 \n\t" - "beq 24f \n\t" - "vst1.16 {d21}, [%[keep_dst]]! \n\t" - - "24: \n\t" - "tst %[count], #2 \n\t" - "beq 22f \n\t" - "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" - - "22: \n\t" - "tst %[count], #1 \n\t" - "beq 21f \n\t" - "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" - - "21: \n\t" - : [count] "+r" (count) - : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); - } -} - -#else // #ifdef SK_CPU_ARM32 - -void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 == alpha); - - if (count >= 16) { - asm ( - "movi v4.8h, #0x80 \t\n" - - "1: \t\n" - "sub %w[count], %w[count], #16 \t\n" - "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n" - "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n" - "prfm pldl1keep, [%[src],#512] \t\n" - "prfm pldl1keep, [%[dst],#256] \t\n" - "ushr v20.8h, v17.8h, #5 \t\n" - "ushr v31.8h, v16.8h, #5 \t\n" - "xtn v6.8b, v31.8h \t\n" - "xtn2 v6.16b, v20.8h \t\n" - "ushr v20.8h, v17.8h, #11 \t\n" - "shl v19.16b, v6.16b, #2 \t\n" - "ushr v31.8h, v16.8h, #11 \t\n" - "xtn v22.8b, v31.8h \t\n" - "xtn2 v22.16b, v20.8h \t\n" - "shl v18.16b, v22.16b, #3 \t\n" - "mvn v3.16b, v3.16b \t\n" - "xtn v16.8b, v16.8h \t\n" - "mov v7.16b, v4.16b \t\n" - "xtn2 v16.16b, v17.8h \t\n" - "umlal v7.8h, v3.8b, v19.8b \t\n" - "shl v16.16b, v16.16b, #3 \t\n" - "mov v22.16b, v4.16b \t\n" - "ushr v24.8h, v7.8h, #6 \t\n" - "umlal v22.8h, v3.8b, v18.8b \t\n" - "ushr v20.8h, v22.8h, #5 \t\n" - "addhn v20.8b, v22.8h, v20.8h \t\n" - "cmp %w[count], #16 \t\n" - "mov v6.16b, v4.16b \t\n" - "mov v5.16b, v4.16b \t\n" - "umlal v6.8h, v3.8b, v16.8b \t\n" - "umlal2 v5.8h, v3.16b, v19.16b \t\n" - "mov v17.16b, v4.16b \t\n" - "ushr v19.8h, v6.8h, #5 \t\n" - "umlal2 v17.8h, v3.16b, v18.16b \t\n" - "addhn v7.8b, v7.8h, v24.8h \t\n" - "ushr v18.8h, v5.8h, #6 \t\n" - "ushr v21.8h, v17.8h, #5 \t\n" - "addhn2 v7.16b, v5.8h, v18.8h \t\n" - "addhn2 v20.16b, v17.8h, v21.8h \t\n" - "mov v22.16b, v4.16b \t\n" - "addhn v6.8b, v6.8h, v19.8h \t\n" - "umlal2 v22.8h, v3.16b, v16.16b \t\n" - "ushr v5.8h, v22.8h, #5 \t\n" - "addhn2 v6.16b, v22.8h, v5.8h \t\n" - "uqadd v7.16b, v1.16b, v7.16b \t\n" -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) - "uqadd v20.16b, v2.16b, v20.16b \t\n" - "uqadd v6.16b, v0.16b, v6.16b \t\n" -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) - "uqadd v20.16b, v0.16b, v20.16b \t\n" - "uqadd v6.16b, v2.16b, v6.16b \t\n" -#else -#error "This function only supports BGRA and RGBA." -#endif - "shll v22.8h, v20.8b, #8 \t\n" - "shll v5.8h, v7.8b, #8 \t\n" - "sri v22.8h, v5.8h, #5 \t\n" - "shll v17.8h, v6.8b, #8 \t\n" - "shll2 v23.8h, v20.16b, #8 \t\n" - "shll2 v7.8h, v7.16b, #8 \t\n" - "sri v22.8h, v17.8h, #11 \t\n" - "sri v23.8h, v7.8h, #5 \t\n" - "shll2 v6.8h, v6.16b, #8 \t\n" - "st1 {v22.8h}, [%[dst]], #16 \t\n" - "sri v23.8h, v6.8h, #11 \t\n" - "st1 {v23.8h}, [%[dst]], #16 \t\n" - "b.ge 1b \t\n" - : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count) - :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v31" - ); - } - // Leftovers - if (count > 0) { - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - *dst = SkSrcOver32To16(c, *dst); - } - dst += 1; - } while (--count != 0); - } -} -#endif // #ifdef SK_CPU_ARM32 - -static uint32_t pmcolor_to_expand16(SkPMColor c) { - unsigned r = SkGetPackedR32(c); - unsigned g = SkGetPackedG32(c); - unsigned b = SkGetPackedB32(c); - return (g << 24) | (r << 13) | (b << 2); -} - -void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) { - uint32_t src_expand; - unsigned scale; - uint16x8_t vmask_blue; - - if (count <= 0) return; - SkASSERT(((size_t)dst & 0x01) == 0); - - /* - * This preamble code is in order to make dst aligned to 8 bytes - * in the next mutiple bytes read & write access. - */ - src_expand = pmcolor_to_expand16(src); - scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; - -#define DST_ALIGN 8 - - /* - * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time. - */ - int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1); - - for (int i = 0; i < preamble_size; i+=2, dst++) { - uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; - *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); - if (--count == 0) - break; - } - - int count16 = 0; - count16 = count >> 4; - vmask_blue = vmovq_n_u16(SK_B16_MASK); - - if (count16) { - uint16x8_t wide_sr; - uint16x8_t wide_sg; - uint16x8_t wide_sb; - uint16x8_t wide_256_sa; - - unsigned sr = SkGetPackedR32(src); - unsigned sg = SkGetPackedG32(src); - unsigned sb = SkGetPackedB32(src); - unsigned sa = SkGetPackedA32(src); - - // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb - // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted, - //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) - wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift - - // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted, - //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5) - wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift - - // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted, - //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) - wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift - - wide_256_sa = - vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3 - - while (count16-- > 0) { - uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b; - uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b; - vdst1 = vld1q_u16(dst); - dst += 8; - vdst2 = vld1q_u16(dst); - dst -= 8; //to store dst again. - - vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes - vdst1_b = vdst1 & vmask_blue; // extract blue - vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extract red - vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green - - vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes - vdst2_b = vdst2 & vmask_blue; // extract blue - vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extract red - vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green - - vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1 - vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1 - vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1 - - vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2 - vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2 - vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2 - - vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red - vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green - vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue - - vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue - vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue - - vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red - vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green - vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue - - vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue - vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue - - vst1q_u16(dst, vdst1); - dst += 8; - vst1q_u16(dst, vdst2); - dst += 8; - } - } - - count &= 0xF; - if (count > 0) { - do { - uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; - *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); - dst += 1; - } while (--count != 0); - } -} - -static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { - prod += vdupq_n_u16(128); - prod += vshrq_n_u16(prod, 8); - return vshrq_n_u16(prod, 8); -} - -void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 > alpha); - - /* This code implements a Neon version of S32A_D565_Blend. The results have - * a few mismatches compared to the original code. These mismatches never - * exceed 1. - */ - - if (count >= 8) { - uint16x8_t valpha_max, vmask_blue; - uint8x8_t valpha; - - // prepare constants - valpha_max = vmovq_n_u16(255); - valpha = vdup_n_u8(alpha); - vmask_blue = vmovq_n_u16(SK_B16_MASK); - - do { - uint16x8_t vdst, vdst_r, vdst_g, vdst_b; - uint16x8_t vres_a, vres_r, vres_g, vres_b; - uint8x8x4_t vsrc; - - // load pixels - vdst = vld1q_u16(dst); -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_4(src); -#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) - asm ( - "vld4.u8 %h[vsrc], [%[src]]!" - : [vsrc] "=w" (vsrc), [src] "+&r" (src) - : : - ); -#else - register uint8x8_t d0 asm("d0"); - register uint8x8_t d1 asm("d1"); - register uint8x8_t d2 asm("d2"); - register uint8x8_t d3 asm("d3"); - - asm volatile ( - "vld4.u8 {d0-d3},[%[src]]!;" - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), - [src] "+&r" (src) - : : - ); - vsrc.val[0] = d0; - vsrc.val[1] = d1; - vsrc.val[2] = d2; - vsrc.val[3] = d3; -#endif - - - // deinterleave dst - vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes - vdst_b = vdst & vmask_blue; // extract blue - vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red - vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green - - // shift src to 565 - vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); - vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); - vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); - - // calc src * src_scale - vres_a = vmull_u8(vsrc.val[NEON_A], valpha); - vres_r = vmull_u8(vsrc.val[NEON_R], valpha); - vres_g = vmull_u8(vsrc.val[NEON_G], valpha); - vres_b = vmull_u8(vsrc.val[NEON_B], valpha); - - // prepare dst_scale - vres_a = SkDiv255Round_neon8(vres_a); - vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 - - // add dst * dst_scale to previous result - vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); - vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); - vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); - -#ifdef S32A_D565_BLEND_EXACT - // It is possible to get exact results with this but it is slow, - // even slower than C code in some cases - vres_r = SkDiv255Round_neon8(vres_r); - vres_g = SkDiv255Round_neon8(vres_g); - vres_b = SkDiv255Round_neon8(vres_b); -#else - vres_r = vrshrq_n_u16(vres_r, 8); - vres_g = vrshrq_n_u16(vres_g, 8); - vres_b = vrshrq_n_u16(vres_b, 8); -#endif - // pack result - vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue - vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue - - // store - vst1q_u16(dst, vres_b); - dst += 8; - count -= 8; - } while (count >= 8); - } - - // leftovers - while (count-- > 0) { - SkPMColor sc = *src++; - if (sc) { - uint16_t dc = *dst; - unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); - unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale); - unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale); - unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale); - *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); - } - dst += 1; - } -} - -/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. - * each dither value is spaced out into byte lanes, and repeated - * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the - * start of each row. - */ -static const uint8_t gDitherMatrix_Neon[48] = { - 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, - 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, - 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, - 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, - -}; - -void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, - int count, U8CPU alpha, int x, int y) -{ - - SkASSERT(255 > alpha); - - // rescale alpha to range 1 - 256 - int scale = SkAlpha255To256(alpha); - - if (count >= 8) { - /* select row and offset for dither array */ - const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; - - uint8x8_t vdither = vld1_u8(dstart); // load dither values - uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values - - int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg - uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask - - do { - - uint8x8x4_t vsrc; - uint8x8_t vsrc_r, vsrc_g, vsrc_b; - uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; - uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; - uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; - uint16x8_t vdst; - uint16x8_t vdst_r, vdst_g, vdst_b; - int16x8_t vres_r, vres_g, vres_b; - int8x8_t vres8_r, vres8_g, vres8_b; - - // Load source and add dither -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_3(src); -#else - { - register uint8x8_t d0 asm("d0"); - register uint8x8_t d1 asm("d1"); - register uint8x8_t d2 asm("d2"); - register uint8x8_t d3 asm("d3"); - - asm ( - "vld4.8 {d0-d3},[%[src]]! " - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) - : - ); - vsrc.val[0] = d0; - vsrc.val[1] = d1; - vsrc.val[2] = d2; - } -#endif - vsrc_r = vsrc.val[NEON_R]; - vsrc_g = vsrc.val[NEON_G]; - vsrc_b = vsrc.val[NEON_B]; - - vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 - vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 - vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 - - vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen - vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen - vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen - - vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result - vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result - vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result - - vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3); - vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2); - vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3); - - // Load dst and unpack - vdst = vld1q_u16(dst); - vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green - vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red - vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue - - // subtract dst from src and widen - vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r)); - vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g)); - vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b)); - - // multiply diffs by scale and shift - vres_r = vmulq_s16(vres_r, vscale); - vres_g = vmulq_s16(vres_g, vscale); - vres_b = vmulq_s16(vres_b, vscale); - - vres8_r = vshrn_n_s16(vres_r, 8); - vres8_g = vshrn_n_s16(vres_g, 8); - vres8_b = vshrn_n_s16(vres_b, 8); - - // add dst to result - vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r); - vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g); - vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b); - - // put result into 565 format - vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue - vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue - - // Store result - vst1q_u16(dst, vreinterpretq_u16_s16(vres_b)); - - // Next iteration - dst += 8; - count -= 8; - - } while (count >= 8); - } - - // Leftovers - if (count > 0) { - int scale = SkAlpha255To256(alpha); - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - - int dither = DITHER_VALUE(x); - int sr = SkGetPackedR32(c); - int sg = SkGetPackedG32(c); - int sb = SkGetPackedB32(c); - sr = SkDITHER_R32To565(sr, dither); - sg = SkDITHER_G32To565(sg, dither); - sb = SkDITHER_B32To565(sb, dither); - - uint16_t d = *dst; - *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), - SkAlphaBlend(sg, SkGetPackedG16(d), scale), - SkAlphaBlend(sb, SkGetPackedB16(d), scale)); - DITHER_INC_X(x); - } while (--count != 0); - } -} - /* Neon version of S32_Blend_BlitRow32() * portable version is in src/core/SkBlitRow_D32.cpp */ @@ -1042,253 +188,8 @@ void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, #endif // #ifdef SK_CPU_ARM32 -void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y) { - SkASSERT(255 == alpha); - -#define UNROLL 8 - - if (count >= UNROLL) { - - uint8x8_t dbase; - const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; - dbase = vld1_u8(dstart); - - do { - uint8x8x4_t vsrc; - uint8x8_t sr, sg, sb, sa, d; - uint16x8_t dst8, scale8, alpha8; - uint16x8_t dst_r, dst_g, dst_b; - -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_4(src); -#else - { - register uint8x8_t d0 asm("d0"); - register uint8x8_t d1 asm("d1"); - register uint8x8_t d2 asm("d2"); - register uint8x8_t d3 asm("d3"); - - asm ("vld4.8 {d0-d3},[%[src]]! " - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) - : - ); - vsrc.val[0] = d0; - vsrc.val[1] = d1; - vsrc.val[2] = d2; - vsrc.val[3] = d3; - } -#endif - sa = vsrc.val[NEON_A]; - sr = vsrc.val[NEON_R]; - sg = vsrc.val[NEON_G]; - sb = vsrc.val[NEON_B]; - - /* calculate 'd', which will be 0..7 - * dbase[] is 0..7; alpha is 0..256; 16 bits suffice - */ - alpha8 = vmovl_u8(dbase); - alpha8 = vmlal_u8(alpha8, sa, dbase); - d = vshrn_n_u16(alpha8, 8); // narrowing too - - // sr = sr - (sr>>5) + d - /* watching for 8-bit overflow. d is 0..7; risky range of - * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; - * safe as long as we do ((sr-sr>>5) + d) - */ - sr = vsub_u8(sr, vshr_n_u8(sr, 5)); - sr = vadd_u8(sr, d); - - // sb = sb - (sb>>5) + d - sb = vsub_u8(sb, vshr_n_u8(sb, 5)); - sb = vadd_u8(sb, d); - - // sg = sg - (sg>>6) + d>>1; similar logic for overflows - sg = vsub_u8(sg, vshr_n_u8(sg, 6)); - sg = vadd_u8(sg, vshr_n_u8(d,1)); - - // need to pick up 8 dst's -- at 16 bits each, 128 bits - dst8 = vld1q_u16(dst); - dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); - dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS); - dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits - - // blend - scale8 = vsubw_u8(vdupq_n_u16(256), sa); - - // combine the addq and mul, save 3 insns - scale8 = vshrq_n_u16(scale8, 3); - dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); - dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); - dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); - - // repack to store - dst8 = vshrq_n_u16(dst_b, 5); - dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); - dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); - - vst1q_u16(dst, dst8); - - dst += UNROLL; - count -= UNROLL; - // skip x += UNROLL, since it's unchanged mod-4 - } while (count >= UNROLL); - } -#undef UNROLL - - // residuals - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - unsigned a = SkGetPackedA32(c); - - // dither and alpha are just temporary variables to work-around - // an ICE in debug. - unsigned dither = DITHER_VALUE(x); - unsigned alpha = SkAlpha255To256(a); - int d = SkAlphaMul(dither, alpha); - - unsigned sr = SkGetPackedR32(c); - unsigned sg = SkGetPackedG32(c); - unsigned sb = SkGetPackedB32(c); - sr = SkDITHER_R32_FOR_565(sr, d); - sg = SkDITHER_G32_FOR_565(sg, d); - sb = SkDITHER_B32_FOR_565(sb, d); - - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); - uint32_t dst_expanded = SkExpand_rgb_16(*dst); - dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); - // now src and dst expanded are in g:11 r:10 x:1 b:10 - *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - } - dst += 1; - DITHER_INC_X(x); - } while (--count != 0); - } -} - -/////////////////////////////////////////////////////////////////////////////// - -void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, - int count, U8CPU alpha, int x, int y) { - SkASSERT(255 == alpha); - -#define UNROLL 8 - if (count >= UNROLL) { - uint8x8_t d; - const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; - d = vld1_u8(dstart); - - while (count >= UNROLL) { - uint8x8_t sr, sg, sb; - uint16x8_t dr, dg, db; - uint16x8_t dst8; - uint8x8x4_t vsrc; - -#ifdef SK_CPU_ARM64 - vsrc = sk_vld4_u8_arm64_3(src); -#else - { - register uint8x8_t d0 asm("d0"); - register uint8x8_t d1 asm("d1"); - register uint8x8_t d2 asm("d2"); - register uint8x8_t d3 asm("d3"); - - asm ( - "vld4.8 {d0-d3},[%[src]]! " - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) - : - ); - vsrc.val[0] = d0; - vsrc.val[1] = d1; - vsrc.val[2] = d2; - } -#endif - sr = vsrc.val[NEON_R]; - sg = vsrc.val[NEON_G]; - sb = vsrc.val[NEON_B]; - - /* XXX: if we want to prefetch, hide it in the above asm() - * using the gcc __builtin_prefetch(), the prefetch will - * fall to the bottom of the loop -- it won't stick up - * at the top of the loop, just after the vld4. - */ - - // sr = sr - (sr>>5) + d - sr = vsub_u8(sr, vshr_n_u8(sr, 5)); - dr = vaddl_u8(sr, d); - - // sb = sb - (sb>>5) + d - sb = vsub_u8(sb, vshr_n_u8(sb, 5)); - db = vaddl_u8(sb, d); - - // sg = sg - (sg>>6) + d>>1; similar logic for overflows - sg = vsub_u8(sg, vshr_n_u8(sg, 6)); - dg = vaddl_u8(sg, vshr_n_u8(d, 1)); - - // pack high bits of each into 565 format (rgb, b is lsb) - dst8 = vshrq_n_u16(db, 3); - dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); - dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); - - // store it - vst1q_u16(dst, dst8); - - dst += UNROLL; - // we don't need to increment src as the asm above has already done it - count -= UNROLL; - x += UNROLL; // probably superfluous - } - } -#undef UNROLL - - // residuals - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - SkASSERT(SkGetPackedA32(c) == 255); - - unsigned dither = DITHER_VALUE(x); - *dst++ = SkDitherRGB32To565(c, dither); - DITHER_INC_X(x); - } while (--count != 0); - } -} - /////////////////////////////////////////////////////////////////////////////// -const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = { - // no dither - S32_D565_Opaque_neon, - S32_D565_Blend_neon, - S32A_D565_Opaque_neon, -#if 0 - S32A_D565_Blend_neon, -#else - nullptr, // https://code.google.com/p/skia/issues/detail?id=2797 -#endif - - // dither - S32_D565_Opaque_Dither_neon, - S32_D565_Blend_Dither_neon, - S32A_D565_Opaque_Dither_neon, - nullptr, // S32A_D565_Blend_Dither -}; - -const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { - Color32A_D565_neon, // Color32_D565, - Color32A_D565_neon, // Color32A_D565, - Color32A_D565_neon, // Color32_D565_Dither, - Color32A_D565_neon, // Color32A_D565_Dither -}; - const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { nullptr, // S32_Opaque, S32_Blend_BlitRow32_neon, // S32_Blend, diff --git a/src/opts/SkBlitRow_opts_arm_neon.h b/src/opts/SkBlitRow_opts_arm_neon.h index 159a466c98..815c2b7476 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.h +++ b/src/opts/SkBlitRow_opts_arm_neon.h @@ -9,8 +9,6 @@ #include "SkBlitRow.h" -extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[]; -extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[]; extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[]; #endif diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp index a9abe06580..289bb7e88c 100644 --- a/src/opts/SkBlitRow_opts_none.cpp +++ b/src/opts/SkBlitRow_opts_none.cpp @@ -9,14 +9,6 @@ // Platform impl of Platform_procs with no overrides -SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { - return nullptr; -} - -SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { - return nullptr; -} - SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { return nullptr; } diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 4b8c8a118b..7917259554 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -78,43 +78,6 @@ void SkBitmapProcState::platformProcs() { //////////////////////////////////////////////////////////////////////////////// -static const SkBlitRow::Proc16 platform_16_procs[] = { - S32_D565_Opaque_SSE2, // S32_D565_Opaque - nullptr, // S32_D565_Blend - S32A_D565_Opaque_SSE2, // S32A_D565_Opaque - nullptr, // S32A_D565_Blend - S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither - nullptr, // S32_D565_Blend_Dither - S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither - nullptr, // S32A_D565_Blend_Dither -}; - -SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { - if (SkCpu::Supports(SkCpu::SSE2)) { - return platform_16_procs[flags]; - } else { - return nullptr; - } -} - -static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = { - Color32A_D565_SSE2, // Color32A_D565, - nullptr, // Color32A_D565_Dither -}; - -SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { -/* If you're thinking about writing an SSE4 version of this, do check it's - * actually faster on Atom. Our original SSE4 version was slower than this - * SSE2 version on Silvermont, and only marginally faster on a Core i7, - * mainly due to the MULLD timings. - */ - if (SkCpu::Supports(SkCpu::SSE2)) { - return platform_565_colorprocs_SSE2[flags]; - } else { - return nullptr; - } -} - static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = { nullptr, // S32_Opaque, S32_Blend_BlitRow32_SSE2, // S32_Blend, |