about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
authorGravatar mtklein <mtklein@chromium.org>2015-03-23 13:19:03 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2015-03-23 13:19:03 -0700
commit2e220fb9ece0fcce8a4c4a3838bbe64d7492b139 (patch)
treef015e15a2305bed415e2e2f87aee18d287256b4e
parentf7edcdedb64ee0d4a5c88807cd75ed1700e5bcce (diff)
Start fresh on swizzles
This removes all the existing Sk4x swizzles and adds badc(), which is both fast on all implementations and currently useful.

BUG=skia:

Review URL: https://codereview.chromium.org/997353005
-rw-r--r-- src/core/Sk4x.h        12
-rw-r--r-- src/opts/Sk4x_neon.h   42
-rw-r--r-- src/opts/Sk4x_none.h    4
-rw-r--r-- src/opts/Sk4x_sse.h    12
-rw-r--r-- tests/Sk4xTest.cpp      9
5 files changed, 9 insertions, 70 deletions
diff --git a/src/core/Sk4x.h b/src/core/Sk4x.h
index ebbc20bdcb..324ecc4bd8 100644
--- a/src/core/Sk4x.h
+++ b/src/core/Sk4x.h
@@ -92,16 +92,8 @@ public:
static Sk4x Min(const Sk4x& a, const Sk4x& b);
static Sk4x Max(const Sk4x& a, const Sk4x& b);
- // Swizzles follow OpenCL xyzw convention.
- Sk4x zwxy() const;
-
- // When there's a second argument, it's abcd.
- static Sk4x XYAB(const Sk4x& xyzw, const Sk4x& abcd);
- static Sk4x ZWCD(const Sk4x& xyzw, const Sk4x& abcd);
-
- // TODO: these are particularly efficient in SSE. Useful? Also efficient in NEON?
- // static Sk4x XAYB(const Sk4x& xyzw, const Sk4x& abcd);
- // static Sk4x ZCWD(const Sk4x& xyzw, const Sk4x& abcd);
+ // Swizzles, where this == abcd.
+ Sk4x badc() const;
private:
// It's handy to have Sk4f and Sk4i be mutual friends.
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 41f6f46885..892fcb9d41 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -99,26 +99,7 @@ M(Sk4i) greaterThanEqual(const Sk4f& o) const { return vreinterpretq_s32_u32(vcg
M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return vminq_f32(a.fVec, b.fVec); }
M(Sk4f) Max(const Sk4f& a, const Sk4f& b) { return vmaxq_f32(a.fVec, b.fVec); }
-// These shuffle operations are implemented more efficiently with SSE.
-// NEON has efficient zip, unzip, and transpose, but it is more costly to
-// exploit zip and unzip in order to shuffle.
-M(Sk4f) zwxy() const {
- float32x4x2_t zip = vzipq_f32(fVec, vdupq_n_f32(0.0));
- return vuzpq_f32(zip.val[1], zip.val[0]).val[0];
-}
-// Note that XYAB and ZWCD share code. If both are needed, they could be
-// implemented more efficiently together. Also, ABXY and CDZW are available
-// as well.
-M(Sk4f) XYAB(const Sk4f& xyzw, const Sk4f& abcd) {
- float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
- float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
- return vuzpq_f32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
-}
-M(Sk4f) ZWCD(const Sk4f& xyzw, const Sk4f& abcd) {
- float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
- float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
- return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
-}
+M(Sk4f) badc() const { return vrev64q_f32(fVec); }
// Sk4i Methods
#undef M
@@ -174,26 +155,7 @@ M(Sk4i) multiply(const Sk4i& o) const { return vmulq_s32(fVec, o.fVec); }
M(Sk4i) Min(const Sk4i& a, const Sk4i& b) { return vminq_s32(a.fVec, b.fVec); }
M(Sk4i) Max(const Sk4i& a, const Sk4i& b) { return vmaxq_s32(a.fVec, b.fVec); }
-// These shuffle operations are implemented more efficiently with SSE.
-// NEON has efficient zip, unzip, and transpose, but it is more costly to
-// exploit zip and unzip in order to shuffle.
-M(Sk4i) zwxy() const {
- int32x4x2_t zip = vzipq_s32(fVec, vdupq_n_s32(0.0));
- return vuzpq_s32(zip.val[1], zip.val[0]).val[0];
-}
-// Note that XYAB and ZWCD share code. If both are needed, they could be
-// implemented more efficiently together. Also, ABXY and CDZW are available
-// as well.
-M(Sk4i) XYAB(const Sk4i& xyzw, const Sk4i& abcd) {
- int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
- int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
- return vuzpq_s32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
-}
-M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {
- int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
- int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
- return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
-}
+M(Sk4i) badc() const { return vrev64q_s32(fVec); }
#undef M
diff --git a/src/opts/Sk4x_none.h b/src/opts/Sk4x_none.h
index 3b7ad46e73..8e17768af6 100644
--- a/src/opts/Sk4x_none.h
+++ b/src/opts/Sk4x_none.h
@@ -111,9 +111,7 @@ M(template <int m, int a, int s, int k> Sk4x<T>) Shuffle(const Sk4x<T>& x, const
k < 4 ? x.fVec[k] : y.fVec[k-4]);
}
-M(Sk4x<T>) zwxy() const { return Shuffle<2,3,0,1>(*this, *this); }
-M(Sk4x<T>) XYAB(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<0,1,4,5>( xyzw, abcd); }
-M(Sk4x<T>) ZWCD(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<2,3,6,7>( xyzw, abcd); }
+M(Sk4x<T>) badc() const { return Shuffle<1,0,3,2>(*this, *this); }
#undef M
diff --git a/src/opts/Sk4x_sse.h b/src/opts/Sk4x_sse.h
index a923a7404d..b2419e543a 100644
--- a/src/opts/Sk4x_sse.h
+++ b/src/opts/Sk4x_sse.h
@@ -62,17 +62,7 @@ template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
// They're all bit-preserving operations so it shouldn't matter.
template <typename T>
-Sk4x<T> Sk4x<T>::zwxy() const { return _mm_shuffle_epi32(as_4i(fVec), _MM_SHUFFLE(1,0,3,2)); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::XYAB(const Sk4x<T>& a, const Sk4x<T>& b) {
- return _mm_movelh_ps(as_4f(a.fVec), as_4f(b.fVec));
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::ZWCD(const Sk4x<T>& a, const Sk4x<T>& b) {
- return _mm_movehl_ps(as_4f(b.fVec), as_4f(a.fVec));
-}
+Sk4x<T> Sk4x<T>::badc() const { return _mm_shuffle_epi32(as_4i(fVec), _MM_SHUFFLE(2,3,0,1)); }
// Now we'll write all Sk4f specific methods. This M() macro will remove some noise.
#define M(...) template <> inline __VA_ARGS__ Sk4f::
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp
index 8c3b977b5b..cbae492a57 100644
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -135,10 +135,7 @@ DEF_TEST(Sk4x_MinMax, r) {
}
DEF_TEST(Sk4x_Swizzle, r) {
- ASSERT_EQ(Sk4f(3,4,1,2), Sk4f(1,2,3,4).zwxy());
- ASSERT_EQ(Sk4f(1,2,5,6), Sk4f::XYAB(Sk4f(1,2,3,4), Sk4f(5,6,7,8)));
- ASSERT_EQ(Sk4f(3,4,7,8), Sk4f::ZWCD(Sk4f(1,2,3,4), Sk4f(5,6,7,8)));
- ASSERT_EQ(Sk4i(3,4,1,2), Sk4i(1,2,3,4).zwxy());
- ASSERT_EQ(Sk4i(1,2,5,6), Sk4i::XYAB(Sk4i(1,2,3,4), Sk4i(5,6,7,8)));
- ASSERT_EQ(Sk4i(3,4,7,8), Sk4i::ZWCD(Sk4i(1,2,3,4), Sk4i(5,6,7,8)));
+ ASSERT_EQ(Sk4f(1,2,3,4).badc(), Sk4f(2,1,4,3));
+
+ ASSERT_EQ(Sk4i(1,2,3,4).badc(), Sk4i(2,1,4,3));
}