diff options
author | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2009-11-30 20:00:29 +0000 |
---|---|---|
committer | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2009-11-30 20:00:29 +0000 |
commit | dc7de745dd142cdc00ffed7963ebb030a0506f72 (patch) | |
tree | 2e83ab31d25f3aa70f905aaef3f97cbe5d52055b /src/opts/SkBlitRow_opts_SSE2.cpp | |
parent | a3d901099d7d295cd7d9df4114e874d9ccfff447 (diff) |
More SSE2 optimizations. This CL implements an SSE2 version of S32_bitmap_D32_filter_DX, and uses aligned loads and stores for dst in all of the blending routines.
Review URL: http://codereview.appspot.com/157141
git-svn-id: http://skia.googlecode.com/svn/trunk@448 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/opts/SkBlitRow_opts_SSE2.cpp')
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 429 |
1 files changed, 230 insertions, 199 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index 8983093ee6..bf2db26d79 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -34,50 +34,60 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, uint32_t src_scale = SkAlpha255To256(alpha); uint32_t dst_scale = 256 - src_scale; - const __m128i *s = reinterpret_cast<const __m128i*>(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i src_scale_wide = _mm_set1_epi16(src_scale); - __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Multiply by scale. - src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); - dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); - - // Divide by 256. - src_rb = _mm_srli_epi16(src_rb, 8); - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. 
- src_pixel = _mm_or_si128(src_rb, src_ag); - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; + if (count >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); + src++; + dst++; + count--; + } + + const __m128i *s = reinterpret_cast<const __m128i*>(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Multiply by scale. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); + dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); + + // Divide by 256. + src_rb = _mm_srli_epi16(src_rb, 8); + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ src_pixel = _mm_or_si128(src_rb, src_ag); + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<SkPMColor*>(d); } - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<SkPMColor*>(d); while (count > 0) { *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); src++; @@ -93,103 +103,114 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, if (count <= 0) { return; } - const __m128i *s = reinterpret_cast<const __m128i*>(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); + + if (count >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkPMSrcOver(*src, *dst); + src++; + dst++; + count--; + } + + const __m128i *s = reinterpret_cast<const __m128i*>(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); #ifdef SK_USE_ACCURATE_BLENDING - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) - __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) - while (count >= 4) { - // Load 4 pixels - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); - dst_ag = _mm_srli_epi16(dst_ag, 8); - // Shift alphas down to lower 8 bits of each quad. - __m128i alpha = _mm_srli_epi32(src_pixel, 24); - - // Copy alpha to upper 3rd byte of each quad - alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); - - // Subtract alphas from 255, to get 0..255 - alpha = _mm_sub_epi16(c_255, alpha); - - // Multiply by red and blue by src alpha. - dst_rb = _mm_mullo_epi16(dst_rb, alpha); - // Multiply by alpha and green by src alpha. 
- dst_ag = _mm_mullo_epi16(dst_ag, alpha); - - // dst_rb_low = (dst_rb >> 8) - __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); - __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); - - // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 - dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); - dst_rb = _mm_add_epi16(dst_rb, c_128); - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask - dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); - dst_ag = _mm_add_epi16(dst_ag, c_128); - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; - } -#else - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) - while (count >= 4) { - // Load 4 pixels - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); - dst_ag = _mm_srli_epi16(dst_ag, 8); - // Shift alphas down to lower 8 bits of each quad. - __m128i alpha = _mm_srli_epi32(src_pixel, 24); - - // Copy alpha to upper 3rd byte of each quad - alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); - - // Subtract alphas from 256, to get 1..256 - alpha = _mm_sub_epi16(c_256, alpha); - - // Multiply by red and blue by src alpha. - dst_rb = _mm_mullo_epi16(dst_rb, alpha); - // Multiply by alpha and green by src alpha. - dst_ag = _mm_mullo_epi16(dst_ag, alpha); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // Mask out high bits (already in the right place) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. 
- dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; - } + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) + __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 255, to get 0..255 + alpha = _mm_sub_epi16(c_255, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // dst_rb_low = (dst_rb >> 8) + __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); + __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); + + // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 + dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); + dst_rb = _mm_add_epi16(dst_rb, c_128); + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask + dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); + dst_ag = _mm_add_epi16(dst_ag, c_128); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + #else + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 256, to get 1..256 + alpha = _mm_sub_epi16(c_256, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // Mask out high bits (already in the right place) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } #endif + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<SkPMColor*>(d); + } - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<SkPMColor*>(d); while (count > 0) { *dst = SkPMSrcOver(*src, *dst); src++; @@ -206,70 +227,80 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, return; } - uint32_t src_scale = SkAlpha255To256(alpha); - - const __m128i *s = reinterpret_cast<const __m128i*>(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i src_scale_wide = _mm_set1_epi16(src_scale); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); - - // Divide by 256. - dst_alpha = _mm_srli_epi16(dst_alpha, 8); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. - dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. 
- src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. - src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_rb = _mm_srli_epi16(src_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; + if (count >= 4) { + while (((size_t)dst & 0x0F) != 0) { + *dst = SkBlendARGB32(*src, *dst, alpha); + src++; + dst++; + count--; + } + + uint32_t src_scale = SkAlpha255To256(alpha); + + const __m128i *s = reinterpret_cast<const __m128i*>(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Put per-pixel alpha in low byte of each word. + __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); + dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); + + // dst_alpha = dst_alpha * src_scale + dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); + + // Divide by 256. 
+ dst_alpha = _mm_srli_epi16(dst_alpha, 8); + + // Subtract alphas from 256, to get 1..256 + dst_alpha = _mm_sub_epi16(c_256, dst_alpha); + + // Multiply red and blue by dst pixel alpha. + dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); + // Multiply alpha and green by dst pixel alpha. + dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); + + // Multiply red and blue by global alpha. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + // Multiply alpha and green by global alpha. + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_rb = _mm_srli_epi16(src_rb, 8); + + // Mask out low bits (goodies already in the right place; no need to divide) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + + // Combine back into RGBA. + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + src_pixel = _mm_or_si128(src_rb, src_ag); + + // Add two pixels into result. + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + src = reinterpret_cast<const SkPMColor*>(s); + dst = reinterpret_cast<SkPMColor*>(d); } - src = reinterpret_cast<const SkPMColor*>(s); - dst = reinterpret_cast<SkPMColor*>(d); + while (count > 0) { *dst = SkBlendARGB32(*src, *dst, alpha); src++; |