diff options
author | mtklein <mtklein@chromium.org> | 2015-03-06 06:15:44 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-03-06 06:15:44 -0800 |
commit | 91fd7371ec80724ec53aae8f2d5a6753499d8963 (patch) | |
tree | 57fdec71f3ca1b8fb451558a6b03a9bf12ad0812 | |
parent | cdf34cdfb930a876910cffd96f080fce55f003bd (diff) |
SkPMFloat: we can beat the naive loops when clamping
Clamping 4 at a time is now about 15% faster than 1 at a time with SSSE3.
Clamping 4 at a time is now about 20% faster with SSE2,
and this applies to non-clamping too (we still just clamp there).
In all cases, 4 at a time is never worse than 1 at a time,
and not clamping is never slower than clamping.
Here are all the bench results, with the numbers for portable code as a fun point
of reference:
SSSE3:
maxrss loops min median mean max stddev samples config bench
10M 2291 4.66ns 4.66ns 4.66ns 4.68ns 0% ▆█▁▁▁▇▁▇▁▃ nonrendering SkPMFloat_get_1x
10M 2040 5.29ns 5.3ns 5.3ns 5.32ns 0% ▃▆▃▃▁▁▆▃▃█ nonrendering SkPMFloat_clamp_1x
10M 7175 4.62ns 4.62ns 4.62ns 4.63ns 0% ▁▄▃████▃▄▇ nonrendering SkPMFloat_get_4x
10M 5801 4.89ns 4.89ns 4.89ns 4.91ns 0% █▂▄▃▁▃▄█▁▁ nonrendering SkPMFloat_clamp_4x
SSE2:
maxrss loops min median mean max stddev samples config bench
10M 1601 6.02ns 6.05ns 6.04ns 6.08ns 0% █▅▄▅▄▂▁▂▂▂ nonrendering SkPMFloat_get_1x
10M 2918 6.05ns 6.06ns 6.05ns 6.06ns 0% ▂▇▁▇▇▁▇█▇▂ nonrendering SkPMFloat_clamp_1x
10M 3569 5.43ns 5.45ns 5.44ns 5.45ns 0% ▄█▂██▇▁▁▇▇ nonrendering SkPMFloat_get_4x
10M 4168 5.43ns 5.43ns 5.43ns 5.44ns 0% █▄▇▁▇▄▁▁▁▁ nonrendering SkPMFloat_clamp_4x
Portable:
maxrss loops min median mean max stddev samples config bench
10M 500 27.8ns 28.1ns 28ns 28.2ns 0% ▃█▆▃▇▃▆▁▇▂ nonrendering SkPMFloat_get_1x
10M 770 40.1ns 40.2ns 40.2ns 40.3ns 0% ▅▁▃▂▆▄█▂▅▂ nonrendering SkPMFloat_clamp_1x
10M 1269 28.4ns 28.8ns 29.1ns 32.7ns 4% ▂▂▂█▂▁▁▂▁▁ nonrendering SkPMFloat_get_4x
10M 1439 40.2ns 40.4ns 40.4ns 40.5ns 0% ▆▆▆█▁▆▅█▅▆ nonrendering SkPMFloat_clamp_4x
SkPMFloat_neon.h is still one big TODO as far as 4-at-a-time APIs go.
BUG=skia:
Review URL: https://codereview.chromium.org/982123002
-rw-r--r-- | src/core/SkPMFloat.h | 15 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSE2.h | 29 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSSE3.h | 25 | ||||
-rw-r--r-- | src/opts/SkPMFloat_neon.h | 13 | ||||
-rw-r--r-- | src/opts/SkPMFloat_none.h | 12 |
5 files changed, 81 insertions, 13 deletions
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h index 2e06ea93e6..4534f1142c 100644 --- a/src/core/SkPMFloat.h +++ b/src/core/SkPMFloat.h @@ -13,10 +13,7 @@ public: static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); } // May be more efficient than one at a time. No special alignment assumed for SkPMColors. - static void From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) { - // TODO: specialize - for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } - } + static void From4PMColors(SkPMFloat[4], const SkPMColor[4]); explicit SkPMFloat(SkPMColor); SkPMFloat(float a, float r, float g, float b) { @@ -51,14 +48,8 @@ public: SkPMColor clamped() const; // Will clamp all values to [0, 255]. Then may assert isValid(). // 4-at-a-time versions of get() and clamped(). Like From4PMColors(), no alignment assumed. - static void To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { - // TODO: specialize - for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } - } - static void ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { - // TODO: specialize - for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); } - } + static void To4PMColors(SkPMColor[4], const SkPMFloat[4]); + static void ClampTo4PMColors(SkPMColor[4], const SkPMFloat[4]); bool isValid() const { return this->a() >= 0 && this->a() <= 255 diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h index e14f94fe13..7298b4da3c 100644 --- a/src/opts/SkPMFloat_SSE2.h +++ b/src/opts/SkPMFloat_SSE2.h @@ -19,7 +19,7 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) { inline SkPMColor SkPMFloat::get() const { SkASSERT(this->isValid()); - return this->clamped(); // At the moment, we don't know anything faster. + return this->clamped(); // Haven't beaten this yet. 
} inline SkPMColor SkPMFloat::clamped() const { @@ -30,3 +30,30 @@ inline SkPMColor SkPMFloat::clamped() const { SkPMColorAssert(c); return c; } + +inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) { + // Haven't beaten this yet. + for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } +} + +inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + SkASSERT(floats[0].isValid() && floats[1].isValid() + && floats[2].isValid() && floats[3].isValid()); + // Haven't beaten this yet. + ClampTo4PMColors(colors, floats); +} + +inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + // Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. + __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)), // _mm_cvtps_epi32 rounds for us! + c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)), + c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)), + c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor)); + __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), + _mm_packus_epi16(c2, c3)); + _mm_storeu_si128((__m128i*)colors, c3210); + SkPMColorAssert(colors[0]); + SkPMColorAssert(colors[1]); + SkPMColorAssert(colors[2]); + SkPMColorAssert(colors[3]); +} diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h index c4fba400dc..ff296178cc 100644 --- a/src/opts/SkPMFloat_SSSE3.h +++ b/src/opts/SkPMFloat_SSSE3.h @@ -36,3 +36,28 @@ inline SkPMColor SkPMFloat::clamped() const { SkPMColorAssert(c); return c; } + +inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) { + // Haven't beaten this yet. + for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } +} + +inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + // Haven't beaten this yet. Still faster than ClampTo4PMColors too. 
+ for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } +} + +inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. + __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)), // _mm_cvtps_epi32 rounds for us! + c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)), + c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)), + c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor)); + __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), + _mm_packus_epi16(c2, c3)); + _mm_storeu_si128((__m128i*)colors, c3210); + SkPMColorAssert(colors[0]); + SkPMColorAssert(colors[1]); + SkPMColorAssert(colors[2]); + SkPMColorAssert(colors[3]); +} diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h index 48cc568d03..036d10d0d6 100644 --- a/src/opts/SkPMFloat_neon.h +++ b/src/opts/SkPMFloat_neon.h @@ -37,3 +37,16 @@ inline SkPMColor SkPMFloat::clamped() const { SkPMColorAssert(c); return c; } + +// TODO: we should be able to beat these loops on all three methods. +inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) { + for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } +} + +inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } +} + +inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); } +} diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h index 011ce07014..a33fa7a28d 100644 --- a/src/opts/SkPMFloat_none.h +++ b/src/opts/SkPMFloat_none.h @@ -24,3 +24,15 @@ inline SkPMColor SkPMFloat::clamped() const { b = b < 0 ? 0 : (b > 255 ? 
255 : b); return SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f); } + +inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors[4]) { + for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } +} + +inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } +} + +inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { + for (int i = 0; i < 4; i++) { colors[i] = floats[i].clamped(); } +} |