diff options
author | mtklein <mtklein@chromium.org> | 2015-03-23 12:01:45 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-03-23 12:01:46 -0700 |
commit | 92d04da38f03dfabd8cd9a7244588a49be9a2f41 (patch) | |
tree | 2acec1a590c4626774a2cc1927c59b116d39c848 /src | |
parent | b8a1d30a42d13ae83690b2d854a024d9b56e7b71 (diff) |
Replace _mm_cvtps_epi32(x) with _mm_cvttps_epi32(_mm_add_ps(0.5f, x)).
We don't have control over which way _mm_cvtps_epi32 rounds: it follows the current MXCSR rounding mode (round-to-nearest-even by default).
- This makes the SSE SkPMFloat rounding consistent with _neon and _none.
- Sk4f::cast<Sk4i>() is closer to (int)float's behavior. (Correct when >=0).
Add tests that would fail at head.
BUG=skia:
Review URL: https://codereview.chromium.org/1029163002
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/Sk4x_sse.h | 7 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSE2.h | 12 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSSE3.h | 15 |
3 files changed, 19 insertions, 15 deletions
diff --git a/src/opts/Sk4x_sse.h b/src/opts/Sk4x_sse.h index ab6876dfd7..a923a7404d 100644 --- a/src/opts/Sk4x_sse.h +++ b/src/opts/Sk4x_sse.h @@ -86,11 +86,10 @@ M(Sk4f) LoadAligned(const float fs[4]) { return _mm_load_ps (fs); } M(void) store (float fs[4]) const { _mm_storeu_ps(fs, fVec); } M(void) storeAligned(float fs[4]) const { _mm_store_ps (fs, fVec); } -template <> -M(Sk4i) reinterpret<Sk4i>() const { return as_4i(fVec); } +template <> M(Sk4i) reinterpret<Sk4i>() const { return as_4i(fVec); } -template <> -M(Sk4i) cast<Sk4i>() const { return _mm_cvtps_epi32(fVec); } +// cvttps truncates, same as (int) when positive. +template <> M(Sk4i) cast<Sk4i>() const { return _mm_cvttps_epi32(fVec); } // We're going to try a little experiment here and skip allTrue(), anyTrue(), and bit-manipulators // for Sk4f. Code that calls them probably does so accidentally. diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h index 9c1295b565..2a85b1a74f 100644 --- a/src/opts/SkPMFloat_SSE2.h +++ b/src/opts/SkPMFloat_SSE2.h @@ -25,7 +25,8 @@ inline SkPMColor SkPMFloat::get() const { } inline SkPMColor SkPMFloat::clamped() const { - __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). + __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)), fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), fix8 = _mm_packus_epi16(fix8_16, fix8_16); SkPMColor c = _mm_cvtsi128_si32(fix8); @@ -47,10 +48,11 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { // Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - __m128i c0 = _mm_cvtps_epi32(floats[0].fColors), // _mm_cvtps_epi32 rounds for us! 
- c1 = _mm_cvtps_epi32(floats[1].fColors), - c2 = _mm_cvtps_epi32(floats[2].fColors), - c3 = _mm_cvtps_epi32(floats[3].fColors); + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). + __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)), + c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)), + c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)), + c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors)); __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), _mm_packus_epi16(c2, c3)); _mm_storeu_si128((__m128i*)colors, c3210); diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h index b2c2b23429..ab54caf3d4 100644 --- a/src/opts/SkPMFloat_SSSE3.h +++ b/src/opts/SkPMFloat_SSSE3.h @@ -23,7 +23,8 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) { inline SkPMColor SkPMFloat::get() const { SkASSERT(this->isValid()); const int _ = 255; // _ means to zero that byte. - __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). + __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)), fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0)); SkPMColor c = _mm_cvtsi128_si32(fix8); SkPMColorAssert(c); @@ -31,7 +32,8 @@ inline SkPMColor SkPMFloat::get() const { } inline SkPMColor SkPMFloat::clamped() const { - __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). 
+ __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)), fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), fix8 = _mm_packus_epi16(fix8_16, fix8_16); SkPMColor c = _mm_cvtsi128_si32(fix8); @@ -51,10 +53,11 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - __m128i c0 = _mm_cvtps_epi32(floats[0].fColors), // _mm_cvtps_epi32 rounds for us! - c1 = _mm_cvtps_epi32(floats[1].fColors), - c2 = _mm_cvtps_epi32(floats[2].fColors), - c3 = _mm_cvtps_epi32(floats[3].fColors); + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). + __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)), + c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)), + c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)), + c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors)); __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), _mm_packus_epi16(c2, c3)); _mm_storeu_si128((__m128i*)colors, c3210); |