diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-05-21 12:43:07 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-05-21 12:43:07 +0000 |
commit | f0ea77a3630e6d1c01d83aa5430b3780da9e88b6 (patch) | |
tree | 8a4371e68c70a8b71b59e73bbae6ae4c8afd109b | |
parent | 74ff1badf43169fa4ecc952994930edec4e56bee (diff) |
SSE2 implementation of memcpy32
With the SSE2 version of memcpy32, S32_Opaque_BlitRow32() in SkBlitRow_D32.cpp
shows about a 30% performance improvement. Here are the benchmark results on a
desktop i7-3770.
before:
bitmap_scale_filter_90_90 8888: cmsecs = 2.01
bitmaprect_FF_filter_trans 8888: cmsecs = 3.61
bitmaprect_FF_nofilter_trans 8888: cmsecs = 3.57
bitmaprect_FF_filter_identity 8888: cmsecs = 3.53
bitmaprect_FF_nofilter_identity 8888: cmsecs = 3.53
bitmap_4444_update 8888: cmsecs = 4.84
bitmap_4444_update_volatile 8888: cmsecs = 4.81
bitmap_4444 8888: cmsecs = 4.81
after:
bitmap_scale_filter_90_90 8888: cmsecs = 1.83
bitmaprect_FF_filter_trans 8888: cmsecs = 2.36
bitmaprect_FF_nofilter_trans 8888: cmsecs = 2.36
bitmaprect_FF_filter_identity 8888: cmsecs = 2.60
bitmaprect_FF_nofilter_identity 8888: cmsecs = 2.63
bitmap_4444_update 8888: cmsecs = 3.30
bitmap_4444_update_volatile 8888: cmsecs = 3.30
bitmap_4444 8888: cmsecs = 3.29
BUG=skia:
R=mtklein@google.com, reed@google.com, bsalomon@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/285313002
git-svn-id: http://skia.googlecode.com/svn/trunk@14822 2bbb7eff-a529-9590-31e7-b0007b416f81
-rw-r--r-- | include/core/SkUtils.h | 9 | ||||
-rw-r--r-- | src/core/SkBlitRow_D32.cpp | 2 | ||||
-rw-r--r-- | src/core/SkUtils.cpp | 46 | ||||
-rw-r--r-- | src/opts/SkUtils_opts_SSE2.cpp | 30 | ||||
-rw-r--r-- | src/opts/SkUtils_opts_SSE2.h | 1 | ||||
-rw-r--r-- | src/opts/SkUtils_opts_arm.cpp | 4 | ||||
-rw-r--r-- | src/opts/SkUtils_opts_none.cpp | 4 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 8 |
8 files changed, 103 insertions, 1 deletions
diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h index 996a82e7dc..d522ae0dea 100644 --- a/include/core/SkUtils.h +++ b/include/core/SkUtils.h @@ -30,6 +30,15 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count); typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count); SkMemset32Proc SkMemset32GetPlatformProc(); +/** Similar to memcpy(), but it copies count 32bit values from src to dst. + @param dst The memory to have value copied into it + @param src The memory to have value copied from it + @param count The number of values should be copied. +*/ +void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count); +typedef void (*SkMemcpy32Proc)(uint32_t dst[], const uint32_t src[], int count); +SkMemcpy32Proc SkMemcpy32GetPlatformProc(); + /////////////////////////////////////////////////////////////////////////////// #define kMaxBytesInUTF8Sequence 4 diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp index c858af6318..f8cf9a34a9 100644 --- a/src/core/SkBlitRow_D32.cpp +++ b/src/core/SkBlitRow_D32.cpp @@ -18,7 +18,7 @@ static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) { SkASSERT(255 == alpha); - memcpy(dst, src, count * sizeof(SkPMColor)); + sk_memcpy32(dst, src, count); } static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst, diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp index 76da23a6d0..c65947dfe9 100644 --- a/src/core/SkUtils.cpp +++ b/src/core/SkUtils.cpp @@ -34,6 +34,18 @@ *(dst)++ = value; *(dst)++ = value; \ *(dst)++ = value; *(dst)++ = value; \ } while (0) + +#define copy_16_longs(dst, src) \ + do { \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; 
\ + *(dst)++ = *(src)++; *(dst)++ = *(src)++; \ + } while (0) #endif /////////////////////////////////////////////////////////////////////////////// @@ -109,6 +121,24 @@ static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) { } } +static void sk_memcpy32_portable(uint32_t dst[], const uint32_t src[], int count) { + SkASSERT(dst != NULL && count >= 0); + + int sixteenlongs = count >> 4; + if (sixteenlongs) { + do { + copy_16_longs(dst, src); + } while (--sixteenlongs != 0); + count &= 15; + } + + if (count) { + do { + *dst++ = *src++; + } while (--count != 0); + } +} + static void choose_memset16(SkMemset16Proc* proc) { *proc = SkMemset16GetPlatformProc(); if (NULL == *proc) { @@ -141,6 +171,22 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count) { return proc(dst, value, count); } +static void choose_memcpy32(SkMemcpy32Proc* proc) { + *proc = SkMemcpy32GetPlatformProc(); + if (NULL == *proc) { + *proc = &sk_memcpy32_portable; + } +} + +void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count) { + SK_DECLARE_STATIC_ONCE(once); + static SkMemcpy32Proc proc = NULL; + SkOnce(&once, choose_memcpy32, &proc); + SkASSERT(proc != NULL); + + return proc(dst, src, count); +} + /////////////////////////////////////////////////////////////////////////////// /* 0xxxxxxx 1 total diff --git a/src/opts/SkUtils_opts_SSE2.cpp b/src/opts/SkUtils_opts_SSE2.cpp index a3c5aa5dfd..bd2f9b29a4 100644 --- a/src/opts/SkUtils_opts_SSE2.cpp +++ b/src/opts/SkUtils_opts_SSE2.cpp @@ -67,3 +67,33 @@ void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count) --count; } } + +void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count) +{ + if (count >= 16) { + while (((size_t)dst) & 0x0F) { + *dst++ = *src++; + --count; + } + __m128i *dst128 = reinterpret_cast<__m128i*>(dst); + const __m128i *src128 = reinterpret_cast<const __m128i*>(src); + while (count >= 16) { + __m128i a = _mm_loadu_si128(src128++); + __m128i b = _mm_loadu_si128(src128++); + 
__m128i c = _mm_loadu_si128(src128++); + __m128i d = _mm_loadu_si128(src128++); + + _mm_store_si128(dst128++, a); + _mm_store_si128(dst128++, b); + _mm_store_si128(dst128++, c); + _mm_store_si128(dst128++, d); + count -= 16; + } + dst = reinterpret_cast<uint32_t*>(dst128); + src = reinterpret_cast<const uint32_t*>(src128); + } + while (count > 0) { + *dst++ = *src++; + --count; + } +} diff --git a/src/opts/SkUtils_opts_SSE2.h b/src/opts/SkUtils_opts_SSE2.h index 5f0bc32954..009f01894b 100644 --- a/src/opts/SkUtils_opts_SSE2.h +++ b/src/opts/SkUtils_opts_SSE2.h @@ -12,5 +12,6 @@ void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count); void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count); +void sk_memcpy32_SSE2(uint32_t *dst, const uint32_t *src, int count); #endif diff --git a/src/opts/SkUtils_opts_arm.cpp b/src/opts/SkUtils_opts_arm.cpp index 2c57a3e977..b1c9d0aa93 100644 --- a/src/opts/SkUtils_opts_arm.cpp +++ b/src/opts/SkUtils_opts_arm.cpp @@ -51,3 +51,7 @@ SkMemset32Proc SkMemset32GetPlatformProc() { return arm_memset32; #endif } + +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + return NULL; +} diff --git a/src/opts/SkUtils_opts_none.cpp b/src/opts/SkUtils_opts_none.cpp index 286f10d7e5..18f52496db 100644 --- a/src/opts/SkUtils_opts_none.cpp +++ b/src/opts/SkUtils_opts_none.cpp @@ -16,3 +16,7 @@ SkMemset16Proc SkMemset16GetPlatformProc() { SkMemset32Proc SkMemset32GetPlatformProc() { return NULL; } + +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + return NULL; +} diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index e3b6b47078..ecce7ca1ec 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -305,6 +305,14 @@ SkMemset32Proc SkMemset32GetPlatformProc() { } } +SkMemcpy32Proc SkMemcpy32GetPlatformProc() { + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return sk_memcpy32_SSE2; + } else { + return NULL; + } +} + //////////////////////////////////////////////////////////////////////////////// 
SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { |