From d6770e69e05c9dcc12f2a1a2d509c0b174372ee7 Mon Sep 17 00:00:00 2001 From: "tomhudson@google.com" Date: Tue, 14 Feb 2012 16:01:15 +0000 Subject: SSE2 version of blit_lcd16, courtesy of Jin Yang. Yields 25-30% speedup on Windows (32b), 4-7% on Linux (64b, less register pressure), not invoked on Mac (lcd text is 32b instead of 16b). Followup: GDI system settings on Windows can suppress LCD text for small fonts, interfering with our benchmarks. (http://code.google.com/p/skia/issues/detail?id=483) http://codereview.appspot.com/5617058/ git-svn-id: http://skia.googlecode.com/svn/trunk@3189 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/core/SkBlitMask.h | 20 ++++ src/core/SkBlitMask_D32.cpp | 129 ++++------------------ src/opts/SkBlitRow_opts_SSE2.cpp | 226 ++++++++++++++++++++++++++++++++++++++- src/opts/SkBlitRow_opts_SSE2.h | 5 + src/opts/SkBlitRow_opts_arm.cpp | 4 + src/opts/SkBlitRow_opts_none.cpp | 6 +- src/opts/opts_check_SSE2.cpp | 12 +++ 7 files changed, 290 insertions(+), 112 deletions(-) (limited to 'src') diff --git a/src/core/SkBlitMask.h b/src/core/SkBlitMask.h index 299f6d1e5c..9c0fe0f7c3 100644 --- a/src/core/SkBlitMask.h +++ b/src/core/SkBlitMask.h @@ -29,6 +29,15 @@ public: typedef void (*ColorProc)(void* dst, size_t dstRB, const void* mask, size_t maskRB, SkColor color, int width, int height); + + /** + * Function pointer that blits a row of mask(lcd16) into a row of dst + * colorized by a single color. The number of pixels to blit is specified + * by width. + */ + typedef void (*BlitLCD16RowProc)(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, + SkPMColor opaqueDst); /** * Function pointer that blits a row of src colors through a row of a mask @@ -49,6 +58,17 @@ public: * or NULL if no optimized routine is available. */ static ColorProc PlatformColorProcs(SkBitmap::Config, SkMask::Format, SkColor); + + /** + * Public entry-point to return a blitcolor BlitLCD16RowProc. 
+ */ + static BlitLCD16RowProc BlitLCD16RowFactory(bool isOpaque); + + /** + * Return either platform specific optimized blitcolor BlitLCD16RowProc, + * or NULL if no optimized routine is available. + */ + static BlitLCD16RowProc PlatformBlitRowProcs16(bool isOpaque); enum RowFlags { kSrcIsOpaque_RowFlag = 1 << 0 diff --git a/src/core/SkBlitMask_D32.cpp b/src/core/SkBlitMask_D32.cpp index 341627aaea..c97e9e6747 100644 --- a/src/core/SkBlitMask_D32.cpp +++ b/src/core/SkBlitMask_D32.cpp @@ -64,106 +64,16 @@ static void D32_A8_Black(void* SK_RESTRICT dst, size_t dstRB, } while (--height != 0); } -/////////////////////////////////////////////////////////////////////////////// - -static inline int upscale31To32(int value) { - SkASSERT((unsigned)value <= 31); - return value + (value >> 4); -} - -static inline int blend32(int src, int dst, int scale) { - SkASSERT((unsigned)src <= 0xFF); - SkASSERT((unsigned)dst <= 0xFF); - SkASSERT((unsigned)scale <= 32); - return dst + ((src - dst) * scale >> 5); -} - -static void blit_lcd16_row(SkPMColor dst[], const uint16_t src[], - SkColor color, int width, SkPMColor) { - int srcA = SkColorGetA(color); - int srcR = SkColorGetR(color); - int srcG = SkColorGetG(color); - int srcB = SkColorGetB(color); - - srcA = SkAlpha255To256(srcA); - - for (int i = 0; i < width; i++) { - uint16_t mask = src[i]; - if (0 == mask) { - continue; - } - - SkPMColor d = dst[i]; - - /* We want all of these in 5bits, hence the shifts in case one of them - * (green) is 6bits. 
- */ - int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); - int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); - int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5); - - // Now upscale them to 0..32, so we can use blend32 - maskR = upscale31To32(maskR); - maskG = upscale31To32(maskG); - maskB = upscale31To32(maskB); - - maskR = maskR * srcA >> 8; - maskG = maskG * srcA >> 8; - maskB = maskB * srcA >> 8; - - int dstR = SkGetPackedR32(d); - int dstG = SkGetPackedG32(d); - int dstB = SkGetPackedB32(d); - - // LCD blitting is only supported if the dst is known/required - // to be opaque - dst[i] = SkPackARGB32(0xFF, - blend32(srcR, dstR, maskR), - blend32(srcG, dstG, maskG), - blend32(srcB, dstB, maskB)); +SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) { + BlitLCD16RowProc proc = PlatformBlitRowProcs16(isOpaque); + if (proc) { + return proc; } -} - -static void blit_lcd16_opaque_row(SkPMColor dst[], const uint16_t src[], - SkColor color, int width, SkPMColor opaqueDst) { - int srcR = SkColorGetR(color); - int srcG = SkColorGetG(color); - int srcB = SkColorGetB(color); - for (int i = 0; i < width; i++) { - uint16_t mask = src[i]; - if (0 == mask) { - continue; - } - if (0xFFFF == mask) { - dst[i] = opaqueDst; - continue; - } - - SkPMColor d = dst[i]; - - /* We want all of these in 5bits, hence the shifts in case one of them - * (green) is 6bits. 
- */ - int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5); - int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5); - int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5); - - // Now upscale them to 0..32, so we can use blend32 - maskR = upscale31To32(maskR); - maskG = upscale31To32(maskG); - maskB = upscale31To32(maskB); - - int dstR = SkGetPackedR32(d); - int dstG = SkGetPackedG32(d); - int dstB = SkGetPackedB32(d); - - // LCD blitting is only supported if the dst is known/required - // to be opaque - dst[i] = SkPackARGB32(0xFF, - blend32(srcR, dstR, maskR), - blend32(srcG, dstG, maskG), - blend32(srcB, dstB, maskB)); + if (isOpaque) { + return SkBlitLCD16OpaqueRow; + } else { + return SkBlitLCD16Row; } } @@ -175,13 +85,14 @@ static void D32_LCD16_Proc(void* SK_RESTRICT dst, size_t dstRB, const uint16_t* srcRow = (const uint16_t*)mask; SkPMColor opaqueDst; - void (*proc)(SkPMColor dst[], const uint16_t src[], - SkColor color, int width, SkPMColor); - if (0xFF == SkColorGetA(color)) { - proc = blit_lcd16_opaque_row; + SkBlitMask::BlitLCD16RowProc proc = NULL; + bool isOpaque = (0xFF == SkColorGetA(color)); + proc = SkBlitMask::BlitLCD16RowFactory(isOpaque); + SkASSERT(proc != NULL); + + if (isOpaque) { opaqueDst = SkPreMultiplyColor(color); } else { - proc = blit_lcd16_row; opaqueDst = 0; // ignored } @@ -546,9 +457,9 @@ static void LCD16_RowProc_Opaque(SkPMColor* SK_RESTRICT dst, int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5); // Now upscale them to 0..32, so we can use blend32 - maskR = upscale31To32(maskR); - maskG = upscale31To32(maskG); - maskB = upscale31To32(maskB); + maskR = SkUpscale31To32(maskR); + maskG = SkUpscale31To32(maskG); + maskB = SkUpscale31To32(maskB); int dstR = SkGetPackedR32(d); int dstG = SkGetPackedG32(d); @@ -557,9 +468,9 @@ static void LCD16_RowProc_Opaque(SkPMColor* SK_RESTRICT dst, // LCD blitting is only supported if the dst is known/required // to be opaque dst[i] = SkPackARGB32(0xFF, - blend32(srcR, dstR, maskR), - 
blend32(srcG, dstG, maskG), - blend32(srcB, dstB, maskB)); + SkBlend32(srcR, dstR, maskR), + SkBlend32(srcG, dstG, maskG), + SkBlend32(srcB, dstB, maskB)); } } diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index f03468f830..066f709e0b 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -386,8 +386,7 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, size_t maskRB, SkColor origColor, - int width, int height) -{ + int width, int height) { SkPMColor color = SkPreMultiplyColor(origColor); size_t dstOffset = dstRB - (width << 2); size_t maskOffset = maskRB - width; @@ -482,3 +481,226 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, mask += maskOffset; } while (--height != 0); } + +static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, + __m128i &mask, __m128i &scale) { + // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. + __m128i r = _mm_and_si128(_mm_slli_epi32(mask, + 16-SK_R16_SHIFT-(SK_R16_BITS-5)), + _mm_set1_epi32(0x001F0000)); + + __m128i g = _mm_and_si128(_mm_slli_epi32(mask, + 8-SK_G16_SHIFT-(SK_G16_BITS-5)), + _mm_set1_epi32(0x00001F00)); + + __m128i b = _mm_and_si128(_mm_slli_epi32(mask, + SK_B16_BITS-5), + _mm_set1_epi32(0x0000001F)); + + // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) + mask = _mm_or_si128(_mm_or_si128(r, g), b); + + // Interleave R,G,B into the lower byte of word. 
+ __m128i maskLo, maskHi; + maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); + maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); + + // Upscale to 0..32 + maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); + maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); + + maskLo = _mm_mullo_epi16(maskLo, scale); + maskHi = _mm_mullo_epi16(maskHi, scale); + + maskLo = _mm_srli_epi16(maskLo, 8); + maskHi = _mm_srli_epi16(maskHi, 8); + + // Interleave R,G,B into the lower byte of the word. + __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); + + maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); + maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); + + maskLo = _mm_srai_epi16(maskLo, 5); + maskHi = _mm_srai_epi16(maskHi, 5); + + // Add two pixels into result. + __m128i resultLo = _mm_add_epi16(dstLo, maskLo); + __m128i resultHi = _mm_add_epi16(dstHi, maskHi); + + // Pack into 4 32bit dst pixels + return _mm_packus_epi16(resultLo, resultHi); +} + +static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, + __m128i &mask) { + // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. + __m128i r = _mm_and_si128(_mm_slli_epi32(mask, + 16-SK_R16_SHIFT-(SK_R16_BITS-5)), + _mm_set1_epi32(0x001F0000)); + + __m128i g = _mm_and_si128(_mm_slli_epi32(mask, + 8-SK_G16_SHIFT-(SK_G16_BITS-5)), + _mm_set1_epi32(0x00001F00)); + + __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5), + _mm_set1_epi32(0x0000001F)); + + // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) + mask = _mm_or_si128(_mm_or_si128(r, g), b); + + // Interleave R,G,B into the lower byte of word. 
+ __m128i maskLo, maskHi; + maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); + maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); + + // Upscale to 0..32 + maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); + maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); + + // Interleave R,G,B into the lower byte of the word. + __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); + + maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); + maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); + + maskLo = _mm_srai_epi16(maskLo, 5); + maskHi = _mm_srai_epi16(maskHi, 5); + + // Add two pixels into result. + __m128i resultLo = _mm_add_epi16(dstLo, maskLo); + __m128i resultHi = _mm_add_epi16(dstHi, maskHi); + + // Pack into 4 32bit dst pixels + return _mm_packus_epi16(resultLo, resultHi); +} + +void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor) { + if (width <= 0) { + return; + } + + int srcA = SkColorGetA(color); + int srcR = SkColorGetR(color); + int srcG = SkColorGetG(color); + int srcB = SkColorGetB(color); + + srcA = SkAlpha255To256(srcA); + + if (width >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); + src++; + dst++; + width--; + } + + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); + srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); + __m128i scale = _mm_set1_epi16(srcA); + while (width >= 4) { + __m128i dst_pixel = _mm_load_si128(d); + __m128i mask_pixel = _mm_loadl_epi64( + reinterpret_cast<const __m128i*>(src)); + + // Check whether mask_pixels are equal to 0 and get the highest bit + // of each byte of result, if mask pixes are all zero, we will get + // pack_cmp to 0xFFFF + int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, + _mm_setzero_si128())); 
+ + // if mask pixels are not all zero, we will blend the dst pixels + if (pack_cmp != 0xFFFF) { + // Unpack 4 16bit mask pixels to + // (p0, 0, p1, 0, p2, 0, p3, 0) + mask_pixel = _mm_unpacklo_epi16(mask_pixel, + _mm_setzero_si128()); + + // Process 4 32bit dst pixels + __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, + mask_pixel, scale); + _mm_store_si128(d, result); + } + + d++; + src += 4; + width -= 4; + } + + dst = reinterpret_cast<SkPMColor*>(d); + } + + while (width > 0) { + *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); + src++; + dst++; + width--; + } +} + +void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor opaqueDst) { + if (width <= 0) { + return; + } + + int srcR = SkColorGetR(color); + int srcG = SkColorGetG(color); + int srcB = SkColorGetB(color); + + if (width >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); + src++; + dst++; + width--; + } + + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); + srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); + while (width >= 4) { + __m128i dst_pixel = _mm_load_si128(d); + __m128i mask_pixel = _mm_loadl_epi64( + reinterpret_cast<const __m128i*>(src)); + + // Check whether mask_pixels are equal to 0 and get the highest bit + // of each byte of result, if mask pixes are all zero, we will get + // pack_cmp to 0xFFFF + int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, + _mm_setzero_si128())); + + // if mask pixels are not all zero, we will blend the dst pixels + if (pack_cmp != 0xFFFF) { + // Unpack 4 16bit mask pixels to + // (p0, 0, p1, 0, p2, 0, p3, 0) + mask_pixel = _mm_unpacklo_epi16(mask_pixel, + _mm_setzero_si128()); + + // Process 4 32bit dst pixels + __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, + mask_pixel); + _mm_store_si128(d, result); + } + + d++; + src += 4; + 
+ width -= 4; + } + + dst = reinterpret_cast<SkPMColor*>(d); + } + + while (width > 0) { + *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); + src++; + dst++; + width--; + } +} diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h index 8493e7a291..b443ec7f21 100644 --- a/src/opts/SkBlitRow_opts_SSE2.h +++ b/src/opts/SkBlitRow_opts_SSE2.h @@ -23,3 +23,8 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask, size_t maskRB, SkColor color, int width, int height); + +void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor); +void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor opaqueDst); diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index abf48626d4..761bf7437b 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -1313,6 +1313,10 @@ SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, return NULL; } +SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { + return NULL; +} + SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, SkMask::Format maskFormat, RowFlags flags) { diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp index d58d2ea720..5f4598e7ae 100644 --- a/src/opts/SkBlitRow_opts_none.cpp +++ b/src/opts/SkBlitRow_opts_none.cpp @@ -31,7 +31,11 @@ SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, SkMask::Format maskFormat, SkColor color) { - return NULL; + return NULL; +} + +SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { + return NULL; } SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, diff --git a/src/opts/opts_check_SSE2.cpp 
b/src/opts/opts_check_SSE2.cpp index 00497c9c77..157d8cc448 100644 --- a/src/opts/opts_check_SSE2.cpp +++ b/src/opts/opts_check_SSE2.cpp @@ -126,6 +126,18 @@ SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, return proc; } +SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { + if (cachedHasSSE2()) { + if (isOpaque) { + return SkBlitLCD16OpaqueRow_SSE2; + } else { + return SkBlitLCD16Row_SSE2; + } + } else { + return NULL; + } + +} SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, SkMask::Format maskFormat, RowFlags flags) { -- cgit v1.2.3