diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-05-21 12:43:07 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2014-05-21 12:43:07 +0000 |
commit | f0ea77a3630e6d1c01d83aa5430b3780da9e88b6 (patch) | |
tree | 8a4371e68c70a8b71b59e73bbae6ae4c8afd109b /src/core/SkUtils.cpp | |
parent | 74ff1badf43169fa4ecc952994930edec4e56bee (diff) |
SSE2 implementation of memcpy32
With the SSE2 version of memcpy32, S32_Opaque_BlitRow32() in SkBlitRow_D32.cpp
is about 30% faster. Here are the benchmark results on a desktop
i7-3770.
before:
bitmap_scale_filter_90_90 8888: cmsecs = 2.01
bitmaprect_FF_filter_trans 8888: cmsecs = 3.61
bitmaprect_FF_nofilter_trans 8888: cmsecs = 3.57
bitmaprect_FF_filter_identity 8888: cmsecs = 3.53
bitmaprect_FF_nofilter_identity 8888: cmsecs = 3.53
bitmap_4444_update 8888: cmsecs = 4.84
bitmap_4444_update_volatile 8888: cmsecs = 4.81
bitmap_4444 8888: cmsecs = 4.81
after:
bitmap_scale_filter_90_90 8888: cmsecs = 1.83
bitmaprect_FF_filter_trans 8888: cmsecs = 2.36
bitmaprect_FF_nofilter_trans 8888: cmsecs = 2.36
bitmaprect_FF_filter_identity 8888: cmsecs = 2.60
bitmaprect_FF_nofilter_identity 8888: cmsecs = 2.63
bitmap_4444_update 8888: cmsecs = 3.30
bitmap_4444_update_volatile 8888: cmsecs = 3.30
bitmap_4444 8888: cmsecs = 3.29
BUG=skia:
R=mtklein@google.com, reed@google.com, bsalomon@google.com
Author: qiankun.miao@intel.com
Review URL: https://codereview.chromium.org/285313002
git-svn-id: http://skia.googlecode.com/svn/trunk@14822 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src/core/SkUtils.cpp')
-rw-r--r-- | src/core/SkUtils.cpp | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp
index 76da23a6d0..c65947dfe9 100644
--- a/src/core/SkUtils.cpp
+++ b/src/core/SkUtils.cpp
@@ -34,6 +34,18 @@
         *(dst)++ = value; *(dst)++ = value; \
         *(dst)++ = value; *(dst)++ = value; \
     } while (0)
+
+#define copy_16_longs(dst, src) \
+    do { \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+        *(dst)++ = *(src)++; *(dst)++ = *(src)++; \
+    } while (0)
 #endif
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -109,6 +121,24 @@ static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) {
     }
 }
 
+static void sk_memcpy32_portable(uint32_t dst[], const uint32_t src[], int count) {
+    SkASSERT(dst != NULL && count >= 0);
+
+    int sixteenlongs = count >> 4;
+    if (sixteenlongs) {
+        do {
+            copy_16_longs(dst, src);
+        } while (--sixteenlongs != 0);
+        count &= 15;
+    }
+
+    if (count) {
+        do {
+            *dst++ = *src++;
+        } while (--count != 0);
+    }
+}
+
 static void choose_memset16(SkMemset16Proc* proc) {
     *proc = SkMemset16GetPlatformProc();
     if (NULL == *proc) {
@@ -141,6 +171,22 @@ void sk_memset32(uint32_t dst[], uint32_t value, int count) {
     return proc(dst, value, count);
 }
 
+static void choose_memcpy32(SkMemcpy32Proc* proc) {
+    *proc = SkMemcpy32GetPlatformProc();
+    if (NULL == *proc) {
+        *proc = &sk_memcpy32_portable;
+    }
+}
+
+void sk_memcpy32(uint32_t dst[], const uint32_t src[], int count) {
+    SK_DECLARE_STATIC_ONCE(once);
+    static SkMemcpy32Proc proc = NULL;
+    SkOnce(&once, choose_memcpy32, &proc);
+    SkASSERT(proc != NULL);
+
+    return proc(dst, src, count);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 /* 0xxxxxxx 1 total