about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
authorGravatar mtklein <mtklein@chromium.org>2015-03-23 13:19:03 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2015-03-23 13:19:03 -0700
commit2e220fb9ece0fcce8a4c4a3838bbe64d7492b139 (patch)
treef015e15a2305bed415e2e2f87aee18d287256b4e
parentf7edcdedb64ee0d4a5c88807cd75ed1700e5bcce (diff)
Start fresh on swizzles
This removes all the existing Sk4x swizzles and adds badc(), which is both fast on all implementations and currently useful.

BUG=skia:

Review URL: https://codereview.chromium.org/997353005
-rw-r--r-- src/core/Sk4x.h        12
-rw-r--r-- src/opts/Sk4x_neon.h   42
-rw-r--r-- src/opts/Sk4x_none.h    4
-rw-r--r-- src/opts/Sk4x_sse.h    12
-rw-r--r-- tests/Sk4xTest.cpp      9
5 files changed, 9 insertions, 70 deletions
diff --git a/src/core/Sk4x.h b/src/core/Sk4x.h
index ebbc20bdcb..324ecc4bd8 100644
--- a/src/core/Sk4x.h
+++ b/src/core/Sk4x.h
@@ -92,16 +92,8 @@ public:
static Sk4x Min(const Sk4x& a, const Sk4x& b);
static Sk4x Max(const Sk4x& a, const Sk4x& b);
- // Swizzles follow OpenCL xyzw convention.
- Sk4x zwxy() const;
-
- // When there's a second argument, it's abcd.
- static Sk4x XYAB(const Sk4x& xyzw, const Sk4x& abcd);
- static Sk4x ZWCD(const Sk4x& xyzw, const Sk4x& abcd);
-
- // TODO: these are particularly efficient in SSE. Useful? Also efficient in NEON?
- // static Sk4x XAYB(const Sk4x& xyzw, const Sk4x& abcd);
- // static Sk4x ZCWD(const Sk4x& xyzw, const Sk4x& abcd);
+ // Swizzles, where this == abcd.
+ Sk4x badc() const;
private:
// It's handy to have Sk4f and Sk4i be mutual friends.
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 41f6f46885..892fcb9d41 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -99,26 +99,7 @@ M(Sk4i) greaterThanEqual(const Sk4f& o) const { return vreinterpretq_s32_u32(vcg
M(Sk4f) Min(const Sk4f& a, const Sk4f& b) { return vminq_f32(a.fVec, b.fVec); }
M(Sk4f) Max(const Sk4f& a, const Sk4f& b) { return vmaxq_f32(a.fVec, b.fVec); }
-// These shuffle operations are implemented more efficiently with SSE.
-// NEON has efficient zip, unzip, and transpose, but it is more costly to
-// exploit zip and unzip in order to shuffle.
-M(Sk4f) zwxy() const {
- float32x4x2_t zip = vzipq_f32(fVec, vdupq_n_f32(0.0));
- return vuzpq_f32(zip.val[1], zip.val[0]).val[0];
-}
-// Note that XYAB and ZWCD share code. If both are needed, they could be
-// implemented more efficiently together. Also, ABXY and CDZW are available
-// as well.
-M(Sk4f) XYAB(const Sk4f& xyzw, const Sk4f& abcd) {
- float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
- float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
- return vuzpq_f32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
-}
-M(Sk4f) ZWCD(const Sk4f& xyzw, const Sk4f& abcd) {
- float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
- float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
- return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
-}
+M(Sk4f) badc() const { return vrev64q_f32(fVec); }
// Sk4i Methods
#undef M
@@ -174,26 +155,7 @@ M(Sk4i) multiply(const Sk4i& o) const { return vmulq_s32(fVec, o.fVec); }
M(Sk4i) Min(const Sk4i& a, const Sk4i& b) { return vminq_s32(a.fVec, b.fVec); }
M(Sk4i) Max(const Sk4i& a, const Sk4i& b) { return vmaxq_s32(a.fVec, b.fVec); }
-// These shuffle operations are implemented more efficiently with SSE.
-// NEON has efficient zip, unzip, and transpose, but it is more costly to
-// exploit zip and unzip in order to shuffle.
-M(Sk4i) zwxy() const {
- int32x4x2_t zip = vzipq_s32(fVec, vdupq_n_s32(0.0));
- return vuzpq_s32(zip.val[1], zip.val[0]).val[0];
-}
-// Note that XYAB and ZWCD share code. If both are needed, they could be
-// implemented more efficiently together. Also, ABXY and CDZW are available
-// as well.
-M(Sk4i) XYAB(const Sk4i& xyzw, const Sk4i& abcd) {
- int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
- int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
- return vuzpq_s32(xayb_zcwd.val[0], axby_czdw.val[0]).val[0];
-}
-M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {
- int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
- int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
- return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
-}
+M(Sk4i) badc() const { return vrev64q_s32(fVec); }
#undef M
diff --git a/src/opts/Sk4x_none.h b/src/opts/Sk4x_none.h
index 3b7ad46e73..8e17768af6 100644
--- a/src/opts/Sk4x_none.h
+++ b/src/opts/Sk4x_none.h
@@ -111,9 +111,7 @@ M(template <int m, int a, int s, int k> Sk4x<T>) Shuffle(const Sk4x<T>& x, const
k < 4 ? x.fVec[k] : y.fVec[k-4]);
}
-M(Sk4x<T>) zwxy() const { return Shuffle<2,3,0,1>(*this, *this); }
-M(Sk4x<T>) XYAB(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<0,1,4,5>( xyzw, abcd); }
-M(Sk4x<T>) ZWCD(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<2,3,6,7>( xyzw, abcd); }
+M(Sk4x<T>) badc() const { return Shuffle<1,0,3,2>(*this, *this); }
#undef M
diff --git a/src/opts/Sk4x_sse.h b/src/opts/Sk4x_sse.h
index a923a7404d..b2419e543a 100644
--- a/src/opts/Sk4x_sse.h
+++ b/src/opts/Sk4x_sse.h
@@ -62,17 +62,7 @@ template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
// They're all bit-preserving operations so it shouldn't matter.
template <typename T>
-Sk4x<T> Sk4x<T>::zwxy() const { return _mm_shuffle_epi32(as_4i(fVec), _MM_SHUFFLE(1,0,3,2)); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::XYAB(const Sk4x<T>& a, const Sk4x<T>& b) {
- return _mm_movelh_ps(as_4f(a.fVec), as_4f(b.fVec));
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::ZWCD(const Sk4x<T>& a, const Sk4x<T>& b) {
- return _mm_movehl_ps(as_4f(b.fVec), as_4f(a.fVec));
-}
+Sk4x<T> Sk4x<T>::badc() const { return _mm_shuffle_epi32(as_4i(fVec), _MM_SHUFFLE(2,3,0,1)); }
// Now we'll write all Sk4f specific methods. This M() macro will remove some noise.
#define M(...) template <> inline __VA_ARGS__ Sk4f::
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp
index 8c3b977b5b..cbae492a57 100644
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -135,10 +135,7 @@ DEF_TEST(Sk4x_MinMax, r) {
}
DEF_TEST(Sk4x_Swizzle, r) {
- ASSERT_EQ(Sk4f(3,4,1,2), Sk4f(1,2,3,4).zwxy());
- ASSERT_EQ(Sk4f(1,2,5,6), Sk4f::XYAB(Sk4f(1,2,3,4), Sk4f(5,6,7,8)));
- ASSERT_EQ(Sk4f(3,4,7,8), Sk4f::ZWCD(Sk4f(1,2,3,4), Sk4f(5,6,7,8)));
- ASSERT_EQ(Sk4i(3,4,1,2), Sk4i(1,2,3,4).zwxy());
- ASSERT_EQ(Sk4i(1,2,5,6), Sk4i::XYAB(Sk4i(1,2,3,4), Sk4i(5,6,7,8)));
- ASSERT_EQ(Sk4i(3,4,7,8), Sk4i::ZWCD(Sk4i(1,2,3,4), Sk4i(5,6,7,8)));
+ ASSERT_EQ(Sk4f(1,2,3,4).badc(), Sk4f(2,1,4,3));
+
+ ASSERT_EQ(Sk4i(1,2,3,4).badc(), Sk4i(2,1,4,3));
}