about | summary | refs | log | tree | commit | diff | homepage
path: root/test/array_cwise.cpp
diff options
context:
space:
mode:
author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-27 18:55:15 -0700
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-28 16:08:16 +0000
commit: b47c7779937c1984b7cd2f1d2f8df33d67c396f7 (patch)
tree: 676fe9f9be5faf255b282dde431295b1894f6813 /test/array_cwise.cpp
parent: 29f0917a43bc4aefb987b180ad6cf19626fb0c23 (diff)
Use a blocked algorithm in transposeInPlace() when the matrix is real and square. This yields a large speedup because we transpose whole blocks in registers (or in L1 cache if we spill) instead of one packet at a time, which in the worst case makes the code write to the same cache line PacketSize times instead of once.
rmlarsen@rmlarsen4:.../eigen_bench/google3$ benchy --benchmarks=.*TransposeInPlace.*float.* --reference=srcfs experimental/users/rmlarsen/bench:matmul_bench
10 / 10 [========================================] 100.00% 2m50s
(Generated by http://go/benchy. Settings: --runs 5 --benchtime 1s --reference "srcfs" --benchmarks ".*TransposeInPlace.*float.*" experimental/users/rmlarsen/bench:matmul_bench)

name                             old time/op   new time/op   delta
BM_TransposeInPlace<float>/4     9.84ns ± 0%   6.51ns ± 0%   -33.80%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/8     23.6ns ± 1%   17.6ns ± 0%   -25.26%  (p=0.016 n=5+4)
BM_TransposeInPlace<float>/16    78.8ns ± 0%   60.3ns ± 0%   -23.50%  (p=0.029 n=4+4)
BM_TransposeInPlace<float>/32    302ns ± 0%    229ns ± 0%    -24.40%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/59    1.03µs ± 0%   0.84µs ± 1%   -17.87%  (p=0.016 n=5+4)
BM_TransposeInPlace<float>/64    1.20µs ± 0%   0.89µs ± 1%   -25.81%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/128   8.96µs ± 0%   3.82µs ± 2%   -57.33%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/256   152µs ± 3%    17µs ± 2%     -89.06%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/512   837µs ± 1%    208µs ± 0%    -75.15%  (p=0.008 n=5+5)
BM_TransposeInPlace<float>/1k    4.28ms ± 2%   1.08ms ± 2%   -74.72%  (p=0.008 n=5+5)
Diffstat (limited to 'test/array_cwise.cpp')
-rw-r--r-- test/array_cwise.cpp 6
1 files changed, 5 insertions, 1 deletions
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index e7af1a92f..950cc9650 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -495,7 +495,11 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
VERIFY_IS_APPROX(m2, m1.transpose());
m2.transposeInPlace();
VERIFY_IS_APPROX(m2, m1);
-
+ // Check vectorized inplace transpose.
+ ArrayType m5 = ArrayType::Random(130, 130);
+ ArrayType m6 = m5;
+ m6.transposeInPlace();
+ VERIFY_IS_APPROX(m6, m5.transpose());
}
template<typename ArrayType> void min_max(const ArrayType& m)