From e2b383632699684d06ae180b3ad85cfb0189973a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 17:13:33 +0200
Subject: Include recent changesets that played with product's kernel

---
 bench/perf_monitoring/gemm/changesets.txt | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'bench/perf_monitoring')
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index fb3e48e99..d00b4603a 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -44,4 +44,8 @@ before-evaluators
 7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
 7591:09a8e2186610   # 3.3-alpha1
 7650:b0f3c8f43025   # help clang inlining
+8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082   # Remove the rotating kernel.
 
-- 
cgit v1.2.3


From b74e45906c67ff5166d070c34fe985167f7d4b3e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 4 Jul 2016 14:30:50 +0200
Subject: Few fixes in perf-monitoring.

---
 bench/perf_monitoring/gemm/make_plot.sh | 2 +-
 bench/perf_monitoring/gemm/run.sh       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'bench/perf_monitoring')

diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
index 4d6053501..cd3214ac9 100755
--- a/bench/perf_monitoring/gemm/make_plot.sh
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -25,7 +25,7 @@ echo "set xtics rotate 1" >> $WHAT.gnuplot
 echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
 echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
 
-col=`cat settings.txt | wc -l`
+col=`cat $bench"_settings.txt" | wc -l`
 echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
 echo " " >>  $WHAT.gnuplot
 
diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh
index bfb4ecfac..9d6ee40bc 100755
--- a/bench/perf_monitoring/gemm/run.sh
+++ b/bench/perf_monitoring/gemm/run.sh
@@ -138,15 +138,15 @@ do
 done
 
 echo "Float:"
-cat $PREFIX"s"$bench.out"
-echo ""
+cat $PREFIX"s""$bench.out"
+echo " "
 
 echo "Double:"
-cat $PREFIX"d"$bench.out"
+cat $PREFIX"d""$bench.out"
 echo ""
 
 echo "Complex:"
-cat $PREFIX"c"$bench.out"
+cat $PREFIX"c""$bench.out"
 echo ""
 
 ./make_plot.sh $PREFIX"s"$bench $bench
-- 
cgit v1.2.3


From dacc544b8445930417d827060061797991e5126d Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 4 Jul 2016 14:32:15 +0200
Subject: asm escape was not strong enough to prevent too aggressive compiler
 optimization let's fallback to no-inline.

---
 bench/perf_monitoring/gemm/lazy_gemm.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'bench/perf_monitoring')

diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp
index b443218d7..6dc370155 100644
--- a/bench/perf_monitoring/gemm/lazy_gemm.cpp
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@@ -12,12 +12,13 @@ using namespace Eigen;
 typedef SCALAR Scalar;
 
 template<typename MatA, typename MatB, typename MatC>
-inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
+EIGEN_DONT_INLINE
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
 {
-  escape((void*)A.data());
-  escape((void*)B.data());
+//   escape((void*)A.data());
+//   escape((void*)B.data());
   C.noalias() += A.lazyProduct(B);
-  escape((void*)C.data());
+//   escape((void*)C.data());
 }
 
 template<int m, int n, int k, int TA>
-- 
cgit v1.2.3


From 75e80792cc98b09d4ba92df67ab810d9af983e87 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 4 Jul 2016 14:32:34 +0200
Subject: Update relevent list of changesets.

---
 bench/perf_monitoring/gemm/changesets.txt | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'bench/perf_monitoring')

diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index d00b4603a..af8eb9b8f 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -42,10 +42,20 @@ before-evaluators
 6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
 6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
 7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91   # Polish lookup tables generation
+7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
 7591:09a8e2186610   # 3.3-alpha1
 7650:b0f3c8f43025   # help clang inlining
-8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
 8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
 8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
 8985:d935df21a082   # Remove the rotating kernel.
-
+8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9   # merge
+9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
-- 
cgit v1.2.3