Diffstat (limited to 'bench')
-rw-r--r--  bench/benchCholesky.cpp                        16
-rw-r--r--  bench/btl/libs/blaze/CMakeLists.txt             7
-rw-r--r--  bench/dense_solvers.cpp                       200
-rw-r--r--  bench/perf_monitoring/gemm/changesets.txt      16
-rw-r--r--  bench/perf_monitoring/gemm/lazy_gemm.cpp        9
-rwxr-xr-x  bench/perf_monitoring/gemm/make_plot.sh         2
-rwxr-xr-x  bench/perf_monitoring/gemm/run.sh               8
-rw-r--r--  bench/tensors/README                           13
-rw-r--r--  bench/tensors/contraction_benchmarks_cpu.cc    39
-rw-r--r--  bench/tensors/tensor_benchmarks.h               7
-rw-r--r--  bench/tensors/tensor_benchmarks_fp16_gpu.cu     1
11 files changed, 243 insertions, 75 deletions
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp
index 42b3e1285..9a8e7cf63 100644
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
@@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
int rows = m.rows();
int cols = m.cols();
- int cost = 0;
+ double cost = 0;
for (int j=0; j<rows; ++j)
{
int r = std::max(rows - j -1,0);
@@ -78,10 +78,10 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
else
std::cout << "fixed ";
std::cout << covMat.rows() << " \t"
- << (timerNoSqrt.value() * REPEAT) / repeats << "s "
- << "(" << 1e-6 * cost*repeats/timerNoSqrt.value() << " MFLOPS)\t"
- << (timerSqrt.value() * REPEAT) / repeats << "s "
- << "(" << 1e-6 * cost*repeats/timerSqrt.value() << " MFLOPS)\n";
+ << (timerNoSqrt.best()) / repeats << "s "
+ << "(" << 1e-9 * cost*repeats/timerNoSqrt.best() << " GFLOPS)\t"
+ << (timerSqrt.best()) / repeats << "s "
+ << "(" << 1e-9 * cost*repeats/timerSqrt.best() << " GFLOPS)\n";
#ifdef BENCH_GSL
@@ -119,13 +119,13 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
int main(int argc, char* argv[])
{
- const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,0};
- std::cout << "size no sqrt standard";
+ const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,1500,0};
+ std::cout << "size LDLT LLT";
// #ifdef BENCH_GSL
// std::cout << " GSL (standard + double + ATLAS) ";
// #endif
std::cout << "\n";
- for (uint i=0; dynsizes[i]>0; ++i)
+ for (int i=0; dynsizes[i]>0; ++i)
benchLLT(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i]));
benchLLT(Matrix<Scalar,2,2>());
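
Note on the output change above: the GFLOPS figures follow from the classical ~n^3/3 flop count of a Cholesky factorization and the best-of-N timing returned by BenchTimer. A minimal sketch of the reporting arithmetic, with illustrative variable names (the actual benchmark accumulates the cost in a loop):

    // Illustrative only: turning a best-of-N timing into a GFLOPS figure.
    // Assumes timer.best() returns the best wall-clock time (in seconds)
    // measured for `repeats` consecutive factorizations.
    double n      = double(size);
    double cost   = n * n * n / 3.0;                        // approx. flops of one LLT
    double gflops = 1e-9 * cost * repeats / timer.best();   // flops of `repeats` runs / time
    std::cout << timer.best() / repeats << "s (" << gflops << " GFLOPS)\n";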
diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt
index f8b1b2ec3..e99a0855c 100644
--- a/bench/btl/libs/blaze/CMakeLists.txt
+++ b/bench/btl/libs/blaze/CMakeLists.txt
@@ -1,10 +1,13 @@
find_package(BLAZE)
-find_package(Boost)
+find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND)
include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
btl_add_bench(btl_blaze main.cpp)
+ # Note: The newest blaze version requires C++14.
+ # Ideally, we should set this depending on the version of Blaze we found
+ set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
if(BUILD_btl_blaze)
- target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+ target_link_libraries(btl_blaze ${Boost_LIBRARIES})
endif()
endif ()
diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp
index aa4ff011f..24343dcd8 100644
--- a/bench/dense_solvers.cpp
+++ b/bench/dense_solvers.cpp
@@ -2,47 +2,74 @@
#include "BenchTimer.h"
#include <Eigen/Dense>
#include <map>
+#include <vector>
#include <string>
+#include <sstream>
using namespace Eigen;
-std::map<std::string,Array<float,1,4> > results;
+std::map<std::string,Array<float,1,8,DontAlign|RowMajor> > results;
+std::vector<std::string> labels;
+std::vector<Array2i> sizes;
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute_norm_equation(Solver &solver, const MatrixType &A) {
+ if(A.rows()!=A.cols())
+ solver.compute(A.transpose()*A);
+ else
+ solver.compute(A);
+}
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute(Solver &solver, const MatrixType &A) {
+ solver.compute(A);
+}
template<typename Scalar,int Size>
-void bench(int id, int size = Size)
+void bench(int id, int rows, int size = Size)
{
- typedef Matrix<Scalar,Size,Size> Mat;
- Mat A(size,size);
+ typedef Matrix<Scalar,Dynamic,Size> Mat;
+ typedef Matrix<Scalar,Dynamic,Dynamic> MatDyn;
+ typedef Matrix<Scalar,Size,Size> MatSquare;
+ Mat A(rows,size);
A.setRandom();
- A = A*A.adjoint();
+ if(rows==size)
+ A = A*A.adjoint();
BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd;
+
+ int svd_opt = ComputeThinU|ComputeThinV;
- int tries = 3;
+ int tries = 5;
int rep = 1000/size;
if(rep==0) rep = 1;
// rep = rep*rep;
- LLT<Mat> llt(A);
- LDLT<Mat> ldlt(A);
- PartialPivLU<Mat> lu(A);
- FullPivLU<Mat> fplu(A);
- HouseholderQR<Mat> qr(A);
- ColPivHouseholderQR<Mat> cpqr(A);
- CompleteOrthogonalDecomposition<Mat> cod(A);
- FullPivHouseholderQR<Mat> fpqr(A);
- JacobiSVD<Mat> jsvd(A.rows(),A.cols());
- BDCSVD<Mat> bdcsvd(A.rows(),A.cols());
+ LLT<MatSquare> llt(size);
+ LDLT<MatSquare> ldlt(size);
+ PartialPivLU<MatSquare> lu(size);
+ FullPivLU<MatSquare> fplu(size,size);
+ HouseholderQR<Mat> qr(A.rows(),A.cols());
+ ColPivHouseholderQR<Mat> cpqr(A.rows(),A.cols());
+ CompleteOrthogonalDecomposition<Mat> cod(A.rows(),A.cols());
+ FullPivHouseholderQR<Mat> fpqr(A.rows(),A.cols());
+ JacobiSVD<MatDyn> jsvd(A.rows(),A.cols());
+ BDCSVD<MatDyn> bdcsvd(A.rows(),A.cols());
- BENCH(t_llt, tries, rep, llt.compute(A));
- BENCH(t_ldlt, tries, rep, ldlt.compute(A));
- BENCH(t_lu, tries, rep, lu.compute(A));
- BENCH(t_fplu, tries, rep, fplu.compute(A));
- BENCH(t_qr, tries, rep, qr.compute(A));
- BENCH(t_cpqr, tries, rep, cpqr.compute(A));
- BENCH(t_cod, tries, rep, cod.compute(A));
- BENCH(t_fpqr, tries, rep, fpqr.compute(A));
+ BENCH(t_llt, tries, rep, compute_norm_equation(llt,A));
+ BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A));
+ BENCH(t_lu, tries, rep, compute_norm_equation(lu,A));
+ if(size<=1000)
+ BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A));
+ BENCH(t_qr, tries, rep, compute(qr,A));
+ BENCH(t_cpqr, tries, rep, compute(cpqr,A));
+ BENCH(t_cod, tries, rep, compute(cod,A));
+ if(size*rows<=10000000)
+ BENCH(t_fpqr, tries, rep, compute(fpqr,A));
if(size<500) // JacobiSVD is really too slow for too large matrices
- BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
- BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV));
+ BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt));
+// if(size*rows<=20000000)
+ BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt));
results["LLT"][id] = t_llt.best();
results["LDLT"][id] = t_ldlt.best();
@@ -52,33 +79,108 @@ void bench(int id, int size = Size)
results["ColPivHouseholderQR"][id] = t_cpqr.best();
results["CompleteOrthogonalDecomposition"][id] = t_cod.best();
results["FullPivHouseholderQR"][id] = t_fpqr.best();
- results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+ results["JacobiSVD"][id] = t_jsvd.best();
results["BDCSVD"][id] = t_bdcsvd.best();
}
+
int main()
{
+ labels.push_back("LLT");
+ labels.push_back("LDLT");
+ labels.push_back("PartialPivLU");
+ labels.push_back("FullPivLU");
+ labels.push_back("HouseholderQR");
+ labels.push_back("ColPivHouseholderQR");
+ labels.push_back("CompleteOrthogonalDecomposition");
+ labels.push_back("FullPivHouseholderQR");
+ labels.push_back("JacobiSVD");
+ labels.push_back("BDCSVD");
+
+ for(int i=0; i<labels.size(); ++i)
+ results[labels[i]].fill(-1);
+
const int small = 8;
- const int medium = 100;
- const int large = 1000;
- const int xl = 4000;
-
- bench<float,small>(0);
- bench<float,Dynamic>(1,medium);
- bench<float,Dynamic>(2,large);
- bench<float,Dynamic>(3,xl);
-
- IOFormat fmt(3, 0, " \t", "\n", "", "");
-
- std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
- std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
- std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
+ sizes.push_back(Array2i(small,small));
+ sizes.push_back(Array2i(100,100));
+ sizes.push_back(Array2i(1000,1000));
+ sizes.push_back(Array2i(4000,4000));
+ sizes.push_back(Array2i(10000,small));
+ sizes.push_back(Array2i(10000,100));
+ sizes.push_back(Array2i(10000,1000));
+ sizes.push_back(Array2i(10000,4000));
+
+ using namespace std;
+
+ for(int k=0; k<sizes.size(); ++k)
+ {
+ cout << sizes[k](0) << "x" << sizes[k](1) << "...\n";
+ bench<float,Dynamic>(k,sizes[k](0),sizes[k](1));
+ }
+
+ cout.width(32);
+ cout << "solver/size";
+ cout << " ";
+ for(int k=0; k<sizes.size(); ++k)
+ {
+ std::stringstream ss;
+ ss << sizes[k](0) << "x" << sizes[k](1);
+ cout.width(10); cout << ss.str(); cout << " ";
+ }
+ cout << endl;
+
+
+ for(int i=0; i<labels.size(); ++i)
+ {
+ cout.width(32); cout << labels[i]; cout << " ";
+ ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+ for(int k=0; k<sizes.size(); ++k)
+ {
+ cout.width(10);
+ if(r(k)>=1e6) cout << "-";
+ else cout << r(k);
+ cout << " ";
+ }
+ cout << endl;
+ }
+
+ // HTML output
+ cout << "<table class=\"manual\">" << endl;
+ cout << "<tr><th>solver/size</th>" << endl;
+ for(int k=0; k<sizes.size(); ++k)
+ cout << " <th>" << sizes[k](0) << "x" << sizes[k](1) << "</th>";
+ cout << "</tr>" << endl;
+ for(int i=0; i<labels.size(); ++i)
+ {
+ cout << "<tr";
+ if(i%2==1) cout << " class=\"alt\"";
+ cout << "><td>" << labels[i] << "</td>";
+ ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+ for(int k=0; k<sizes.size(); ++k)
+ {
+ if(r(k)>=1e6) cout << "<td>-</td>";
+ else
+ {
+ cout << "<td>" << r(k);
+ if(i>0)
+ cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")";
+ if(i<4 && sizes[k](0)!=sizes[k](1))
+ cout << " <sup><a href=\"#note_ls\">*</a></sup>";
+ cout << "</td>";
+ }
+ }
+ cout << "</tr>" << endl;
+ }
+ cout << "</table>" << endl;
+
+// cout << "LLT (ms) " << (results["LLT"]*1000.).format(fmt) << "\n";
+// cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+// cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+// cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+// cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+// cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+// cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
+// cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+// cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+// cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
}
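
Note on compute_norm_equation above: it lets the square-only factorizations (LLT, LDLT, LU) handle the new tall rectangular test cases by factoring the Gram matrix A^T*A instead of A itself. A minimal sketch of the two least-squares routes being compared, with hypothetical sizes and variable names (not part of the patch):

    // Least squares via the normal equations (what compute_norm_equation feeds LLT/LDLT/LU).
    MatrixXf A = MatrixXf::Random(10000, 100);
    VectorXf b = VectorXf::Random(10000);
    LLT<MatrixXf> llt(A.transpose() * A);
    VectorXf x_ne = llt.solve(A.transpose() * b);
    // Least squares by factoring A directly (what the QR/COD/SVD solvers do).
    VectorXf x_qr = A.colPivHouseholderQr().solve(b);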
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index fb3e48e99..af8eb9b8f 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -42,6 +42,20 @@ before-evaluators
6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91 # Polish lookup tables generation
+7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
7591:09a8e2186610 # 3.3-alpha1
7650:b0f3c8f43025 # help clang inlining
-
+#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082 # Remove the rotating kernel.
+8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9 # merge
+9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775
diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp
index b443218d7..6dc370155 100644
--- a/bench/perf_monitoring/gemm/lazy_gemm.cpp
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@@ -12,12 +12,13 @@ using namespace Eigen;
typedef SCALAR Scalar;
template<typename MatA, typename MatB, typename MatC>
-inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
+EIGEN_DONT_INLINE
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
{
- escape((void*)A.data());
- escape((void*)B.data());
+// escape((void*)A.data());
+// escape((void*)B.data());
C.noalias() += A.lazyProduct(B);
- escape((void*)C.data());
+// escape((void*)C.data());
}
template<int m, int n, int k, int TA>
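
As a reminder of what this benchmark times: lazyProduct bypasses Eigen's cache-blocked GEMM kernel and evaluates the product coefficient-wise, which is the interesting code path for the tiny fixed-size matrices exercised here. A minimal usage sketch:

    // Coefficient-based product: no temporary, no cache blocking.
    Matrix<float, 3, 4> A;  Matrix<float, 4, 5> B;  Matrix<float, 3, 5> C;
    A.setRandom(); B.setRandom(); C.setZero();
    C.noalias() += A.lazyProduct(B);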
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
index 4d6053501..cd3214ac9 100755
--- a/bench/perf_monitoring/gemm/make_plot.sh
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -25,7 +25,7 @@ echo "set xtics rotate 1" >> $WHAT.gnuplot
echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
-col=`cat settings.txt | wc -l`
+col=`cat $bench"_settings.txt" | wc -l`
echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
echo " " >> $WHAT.gnuplot
diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh
index bfb4ecfac..9d6ee40bc 100755
--- a/bench/perf_monitoring/gemm/run.sh
+++ b/bench/perf_monitoring/gemm/run.sh
@@ -138,15 +138,15 @@ do
done
echo "Float:"
-cat $PREFIX"s"$bench.out"
-echo ""
+cat $PREFIX"s""$bench.out"
+echo " "
echo "Double:"
-cat $PREFIX"d"$bench.out"
+cat $PREFIX"d""$bench.out"
echo ""
echo "Complex:"
-cat $PREFIX"c"$bench.out"
+cat $PREFIX"c""$bench.out"
echo ""
./make_plot.sh $PREFIX"s"$bench $bench
diff --git a/bench/tensors/README b/bench/tensors/README
index 4398aa81b..803cb8ef8 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -1,12 +1,15 @@
-Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
+The tensor benchmark suite is made of several parts.
+
+The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
To compile the floating point CPU benchmarks, simply call:
g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
To compile the floating point GPU benchmarks, simply call:
-nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu
-
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
-To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
-nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu
+We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call:

+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc
new file mode 100644
index 000000000..f9e57ad47
--- /dev/null
+++ b/bench/tensors/contraction_benchmarks_cpu.cc
@@ -0,0 +1,39 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads) \
+Eigen::ThreadPool pool(threads); \
+Eigen::ThreadPoolDevice device(&pool, threads);
+
+
+// Contractions for number of threads ranging from 1 to 32
+// Dimensions are Rows, Cols, Depth
+#define BM_ContractionCPU(D1, D2, D3) \
+ static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
+ StopBenchmarkTiming(); \
+ CREATE_THREAD_POOL(Threads); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+ suite.contraction(iters); \
+ } \
+ BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
+
+
+// Vector Matrix and Matrix Vector products
+BM_ContractionCPU(1, 2000, 500);
+BM_ContractionCPU(2000, 1, 500);
+
+// Various skinny matrices
+BM_ContractionCPU(250, 3, 512);
+BM_ContractionCPU(1500, 3, 512);
+
+BM_ContractionCPU(512, 800, 4);
+BM_ContractionCPU(512, 80, 800);
+BM_ContractionCPU(512, 80, 13522);
+BM_ContractionCPU(1, 80, 13522);
+
+BM_ContractionCPU(3200, 512, 4);
+BM_ContractionCPU(3200, 512, 80);
+BM_ContractionCPU(3200, 80, 512);
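
For readers unfamiliar with the benchmark macros, here is roughly what one of the instantiations above expands to (written out by hand for illustration only):

    // Approximate hand-expansion of BM_ContractionCPU(512, 800, 4):
    static void BM_Contraction_512x800x4(int iters, int Threads) {
      StopBenchmarkTiming();
      Eigen::ThreadPool pool(Threads);
      Eigen::ThreadPoolDevice device(&pool, Threads);
      BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, 512, 800, 4);
      suite.contraction(iters);
    }
    BENCHMARK_RANGE(BM_Contraction_512x800x4, 1, 32);  // sweep the thread count from 1 to 32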
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 62533a608..c2fb3dede 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -178,9 +178,14 @@ template <typename Device, typename T> class BenchmarkSuite {
size_b[1] = m_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+#if defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
+ Eigen::type2indexpair<2, 1> > paddings;
+#else
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -368,7 +373,7 @@ template <typename Device, typename T> class BenchmarkSuite {
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
Eigen::array<TensorIndex, 0> output_size;
- TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
+ TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);
StartBenchmarkTiming();
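
Note on the EIGEN_HAS_INDEX_LIST branch above: IndexPairList encodes the padding amounts in the type so they are known at compile time, while the fallback builds the same specification at runtime; both forms are consumed identically. A minimal sketch of how such a padding specification is used, assuming the unsupported Tensor module is included and with made-up sizes:

    // needs: #include <unsupported/Eigen/CXX11/Tensor>
    Eigen::Tensor<float, 2> in(30, 40);
    in.setRandom();
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 0);  // no padding along dim 0
    paddings[1] = Eigen::IndexPair<Eigen::DenseIndex>(2, 1);  // 2 zeros before, 1 after along dim 1
    Eigen::Tensor<float, 2> out = in.pad(paddings);           // result is 30 x 43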
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 14876556e..65784d0d6 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
// Contractions