23 files changed, 2662 insertions, 27 deletions
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
new file mode 100644
index 000000000..d563a1d2d
--- /dev/null
+++ b/bench/analyze-blocking-sizes.cpp
@@ -0,0 +1,876 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <memory>
+
+#include <Eigen/Core>
+
+using namespace std;
+
+const int default_precision = 4;  // stream precision applied to cout and cerr in main()
+
+// Set by --only-cubic-sizes: measurements whose product size does not have k == m == n are skipped.
+bool only_cubic_sizes = false;
+
+// Set by --dump-tables: print_partition() also emits a lookup table for every subset.
+bool dump_tables = false;
+
+uint8_t log2_pot(size_t x) {
+  uint8_t shifts = 0;  // number of halvings until x reaches zero: floor(log2(x)) for x >= 1, and 0 for x == 0
+  for (x >>= 1; x != 0; x >>= 1) ++shifts;
+  return shifts;
+}
+
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return log2_pot(n) | (log2_pot(m) << 4) | (log2_pot(k) << 8);  // packs the three log2 values as hex digits 0xKMN
+}
+
+// Holds the (K, M, N) sizes of a matrix product, each stored in 16 bits.
+struct size_triple_t
+{
+  uint16_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  // Expands the compact 0xKMN form: each hex digit is the log2 of one size.
+  size_triple_t(uint16_t compact)
+    : k(1 << ((compact >> 8) & 0xf))
+    , m(1 << ((compact >> 4) & 0xf))
+    , n(1 << (compact & 0xf))
+  {}
+  bool is_cubic() const { return m == k && n == m; }
+};
+
+ostream& operator<<(ostream& s, const size_triple_t& t) {
+  s << "(" << t.k << ", " << t.m << ", " << t.n << ")";  // rendered as "(k, m, n)"
+  return s;
+}
+
+struct inputfile_entry_t  // one measurement line as parsed by inputfile_t
+{
+  uint16_t product_size;            // compact product size, parsed from the leading hex field
+  uint16_t pot_block_size;          // compact block size for all-POT-sizes files; set to 0 for default-sizes files
+  size_triple_t nonpot_block_size;  // (k, m, n) parsed from "default(...)"; only assigned for default-sizes files
+  float gflops;                     // floating-point value parsed from the end of the line
+};
+
+struct inputfile_t  // one parsed measurements file; all parsing happens in the constructor
+{
+  enum class type_t {
+    unknown,
+    all_pot_sizes,
+    default_sizes
+  };
+
+  string filename;
+  vector<inputfile_entry_t> entries;
+  type_t type;
+  // Reads fname line by line; every failure is reported on cerr and ends the process via exit(1).
+  inputfile_t(const string& fname)
+    : filename(fname)
+    , type(type_t::unknown)
+  {
+    ifstream stream(filename);
+    if (!stream.is_open()) {
+      cerr << "couldn't open input file: " << filename << endl;
+      exit(1);
+    }
+    string line;
+    while (getline(stream, line)) {
+      if (line.empty()) continue;
+      if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines" << endl;
+          exit(1);
+        }
+        type = type_t::all_pot_sizes;
+        continue;
+      }
+      if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines" << endl;
+          exit(1);
+        }
+        type = type_t::default_sizes;
+        continue;
+      }
+      // Anything that precedes the BEGIN MEASUREMENTS header is skipped.
+
+      if (type == type_t::unknown) {
+        continue;
+      }
+      switch(type) {
+        case type_t::all_pot_sizes: {
+          unsigned int product_size, block_size;
+          float gflops;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x %x %f",
+                   &product_size,
+                   &block_size,
+                   &gflops);
+          if (3 != sscanf_result ||
+              !product_size ||
+              product_size > 0xfff ||
+              !block_size ||
+              block_size > 0xfff ||
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = uint16_t(block_size);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        case type_t::default_sizes: {
+          unsigned int product_size;
+          float gflops;
+          int bk, bm, bn;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
+                   &product_size,
+                   &bk, &bm, &bn,
+                   &gflops);
+          if (5 != sscanf_result ||
+              !product_size || product_size > 0xfff ||
+              min(bk, min(bm, bn)) < 0 || max(bk, max(bm, bn)) > 0xffff ||  // must fit size_triple_t's uint16_t fields
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = 0;
+          entry.nonpot_block_size = size_triple_t(bk, bm, bn);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        // type_t::unknown never reaches the switch; see the check above it.
+        default:
+          break;
+      }
+    }
+    stream.close();
+    if (type == type_t::unknown) {
+      cerr << "Unrecognized input file " << filename << endl;
+      exit(1);
+    }
+    if (entries.empty()) {
+      cerr << "didn't find any measurements in input file: " << filename << endl;
+      exit(1);
+    }
+  }
+};
+
+struct preprocessed_inputfile_entry_t
+{
+  uint16_t product_size;  // copied from inputfile_entry_t::product_size
+  uint16_t block_size;    // copied from inputfile_entry_t::pot_block_size
+
+  float efficiency;       // gflops divided by the highest gflops measured for the same product size
+};
+
+bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
+{
+  return e2.efficiency > e1.efficiency;  // true when e1 is strictly less efficient than e2
+}
+
+struct preprocessed_inputfile_t
+{
+  string filename;
+  vector<preprocessed_inputfile_entry_t> entries;
+
+  // Converts the raw measurements of an all-POT-sizes input file into
+  // per-product-size efficiencies. Aborts if inputfile is of any other type.
+  preprocessed_inputfile_t(const inputfile_t& inputfile)
+    : filename(inputfile.filename)
+  {
+    if (inputfile.type != inputfile_t::type_t::all_pot_sizes) abort();
+    auto it = inputfile.entries.begin();
+    auto it_first_with_given_product_size = it;
+    while (it != inputfile.entries.end()) {
+      ++it;
+      if (it == inputfile.entries.end() ||  // a run ends at the end of the file or where the product size changes
+        it->product_size != it_first_with_given_product_size->product_size)
+      {
+        import_input_file_range_one_product_size(it_first_with_given_product_size, it);
+        it_first_with_given_product_size = it;
+      }
+    }
+  }
+
+private:
+  void import_input_file_range_one_product_size(  // appends one entry per measurement in [begin, end)
+    const vector<inputfile_entry_t>::const_iterator& begin,
+    const vector<inputfile_entry_t>::const_iterator& end)
+  {
+    uint16_t product_size = begin->product_size;
+    float max_gflops = 0.0f;
+    for (auto it = begin; it != end; ++it) {
+      if (it->product_size != product_size) {
+        cerr << "Unexpected ordering of entries in " << filename << endl;
+        cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
+        exit(1);
+      }
+      max_gflops = max(max_gflops, it->gflops);
+    }
+    if (max_gflops <= 0.0f) {  // the division below would otherwise produce NaN or infinite efficiencies
+      cerr << "No positive GFlop/s measurement for product size " << hex << product_size << dec << " in " << filename << endl;
+      exit(1);
+    }
+    for (auto it = begin; it != end; ++it) {
+      entries.push_back({it->product_size, it->pot_block_size, it->gflops / max_gflops});  // field order: product_size, block_size, efficiency
+    }
+  }
+};
+
+void check_all_files_in_same_exact_order(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
+{
+  // Nothing to compare when there are no files at all.
+  if (preprocessed_inputfiles.empty()) {
+    return;
+  }
+  const preprocessed_inputfile_t& reference = preprocessed_inputfiles.front();
+  const size_t expected_count = reference.entries.size();
+  // First pass: every file must have as many entries as the first one.
+  for (const preprocessed_inputfile_t& file : preprocessed_inputfiles) {
+    if (file.entries.size() != expected_count) {
+      cerr << "these files have different number of entries: "
+           << file.filename
+           << " and "
+           << reference.filename
+           << endl;
+      exit(1);
+    }
+  }
+
+  // Second pass: entry by entry, both sizes must agree with the first file.
+  for (size_t e = 0; e < expected_count; e++) {
+    const preprocessed_inputfile_entry_t& expected = reference.entries[e];
+    for (const preprocessed_inputfile_t& file : preprocessed_inputfiles) {
+      const preprocessed_inputfile_entry_t& actual = file.entries[e];
+      if (actual.product_size != expected.product_size ||
+          actual.block_size != expected.block_size)
+      {
+        cerr << "entries not in same order between these files: "
+             << reference.filename
+             << " and "
+             << file.filename
+             << endl;
+        exit(1);
+      }
+    }
+  }
+}
+
+float efficiency_of_subset(  // lowest, over all product sizes, of the best efficiency one common block size gives the whole subset
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<size_t>& subset)  // indices into preprocessed_inputfiles
+{
+  if (subset.size() <= 1) {
+    return 1.0f;  // a subset of at most one file is reported as fully efficient
+  }
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+  const size_t num_entries = first_file.entries.size();
+  float efficiency = 1.0f;
+  size_t entry_index = 0;
+  size_t first_entry_index_with_this_product_size = 0;
+  uint16_t product_size = first_file.entries[0].product_size;
+  while (entry_index < num_entries) {
+    ++entry_index;
+    if (entry_index == num_entries ||  // reached the end of a run of entries sharing product_size
+        first_file.entries[entry_index].product_size != product_size)
+    {
+      float efficiency_this_product_size = 0.0f;
+      for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+        float efficiency_this_entry = 1.0f;
+        for (auto i = subset.begin(); i != subset.end(); ++i) {
+          efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);  // worst file for entry e
+        }
+        efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);  // best entry of the run
+      }
+      efficiency = min(efficiency, efficiency_this_product_size);  // worst product size so far
+      if (entry_index < num_entries) {
+        first_entry_index_with_this_product_size = entry_index;  // start the next run
+        product_size = first_file.entries[entry_index].product_size;
+      }
+    }
+  }
+
+  return efficiency;
+}
+
+void dump_table_for_subset(  // prints to cout the source of a LookupTable struct holding the best block size per product size
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<size_t>& subset)  // indices into preprocessed_inputfiles; subset[0] is read unconditionally
+{
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+  const size_t num_entries = first_file.entries.size();
+  size_t entry_index = 0;
+  size_t first_entry_index_with_this_product_size = 0;
+  uint16_t product_size = first_file.entries[0].product_size;
+  size_t i = 0;  // number of table entries written so far
+  size_triple_t min_product_size(first_file.entries.front().product_size);
+  size_triple_t max_product_size(first_file.entries.back().product_size);
+  if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {  // first and last product sizes must be cubic
+    abort();
+  }
+  if (only_cubic_sizes) {
+    cerr << "Can't generate tables with --only-cubic-sizes." << endl;
+    abort();
+  }
+  cout << "struct LookupTable {" << endl;
+  cout << "  static const size_t BaseSize = " << min_product_size.k << ";" << endl;
+  const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;  // count of powers of two from the first k up to the last k
+  const size_t TableSize = NumSizes * NumSizes * NumSizes;
+  cout << "  static const size_t NumSizes = " << NumSizes << ";" << endl;
+  cout << "  static const unsigned short* Data() {" << endl;
+  cout << "    static const unsigned short data[" << TableSize << "] = {";
+  while (entry_index < num_entries) {
+    ++entry_index;
+    if (entry_index == num_entries ||  // reached the end of a run of entries sharing product_size
+        first_file.entries[entry_index].product_size != product_size)
+    {
+      float best_efficiency_this_product_size = 0.0f;
+      uint16_t best_block_size_this_product_size = 0;
+      for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+        float efficiency_this_entry = 1.0f;
+        for (auto i = subset.begin(); i != subset.end(); ++i) {  // note: this iterator shadows the entry counter i
+          efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+        }
+        if (efficiency_this_entry > best_efficiency_this_product_size) {
+          best_efficiency_this_product_size = efficiency_this_entry;
+          best_block_size_this_product_size = first_file.entries[e].block_size;
+        }
+      }
+      if ((i++) % NumSizes) {  // start a new output line every NumSizes entries
+        cout << " ";
+      } else {
+        cout << endl << "      ";
+      }
+      cout << "0x" << hex << best_block_size_this_product_size << dec;
+      if (entry_index < num_entries) {
+        cout << ",";
+        first_entry_index_with_this_product_size = entry_index;
+        product_size = first_file.entries[entry_index].product_size;
+      }
+    }
+  }
+  if (i != TableSize) {  // the number of product sizes seen must match the declared array size
+    cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
+    abort();
+  }
+  cout << endl << "    };" << endl;
+  cout << "    return data;" << endl;
+  cout << "  }" << endl;
+  cout << "};" << endl;
+}
+
+float efficiency_of_partition(
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<vector<size_t>>& partition)
+{
+  // A partition is only as efficient as its least efficient subset.
+  float worst = 1.0f;
+  for (const vector<size_t>& subset : partition)
+    worst = min(worst, efficiency_of_subset(preprocessed_inputfiles, subset));
+  return worst;
+}
+
+void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
+{
+  assert(subset_size >= 1 && subset_size <= set_size);
+  // The first subset in enumeration order is {0, 1, ..., subset_size - 1}.
+  out_subset.resize(subset_size);
+  size_t next_value = 0;
+  for (size_t& element : out_subset) element = next_value++;
+}
+
+bool is_last_subset(const vector<size_t>& subset, size_t set_size)
+{
+  return subset.front() + subset.size() == set_size;  // the first element cannot move any further right
+}
+
+void next_subset(vector<size_t>& inout_subset, size_t set_size)
+{
+  if (is_last_subset(inout_subset, set_size)) {
+    cerr << "iterating past the last subset" << endl;
+    abort();
+  }
+  // Walk back from the end past every element already at its maximum value.
+  const size_t count = inout_subset.size();
+  size_t back = 1;
+  while (inout_subset[count - back] == set_size - back) {
+    back++;
+    assert(back <= count);
+  }
+  // Bump the first element that can still grow, then line up its successors
+  // directly behind it as consecutive values.
+  size_t value = ++inout_subset[count - back];
+  for (size_t j = count - back + 1; j < count; j++) inout_subset[j] = ++value;
+}
+
+const size_t number_of_subsets_limit = 100;  // is_number_of_subsets_feasible() rejects subset sizes that would exceed this many subsets
+const size_t always_search_subsets_of_size_at_least = 2;  // lower bound applied in max_feasible_subset_size(), capped at n - 1
+
+bool is_number_of_subsets_feasible(size_t n, size_t p)
+{
+  assert(n>0 && p>0 && p<=n);
+  // Build the binomial coefficient one factor at a time as a fraction and
+  // give up as soon as it exceeds number_of_subsets_limit.
+  uint64_t numerator = 1, denominator = 1;
+  for (size_t k = 1; k <= p; k++) {
+    numerator *= n - (k - 1);
+    denominator *= k;
+    if (numerator > denominator * number_of_subsets_limit) return false;
+  }
+  return true;
+}
+
+size_t max_feasible_subset_size(size_t n)
+{
+  assert(n > 0);
+  const size_t largest_proper = n - 1;
+  const size_t floor_size = min<size_t>(largest_proper, always_search_subsets_of_size_at_least);
+  // Grow the size until enumerating the next larger size becomes infeasible.
+  for (size_t size = 1; size <= largest_proper; size++) {
+    if (!is_number_of_subsets_feasible(n, size + 1)) return max(size, floor_size);
+  }
+  return largest_proper;
+}
+
+void find_subset_with_efficiency_higher_than(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       float required_efficiency_to_beat,
+       vector<size_t>& inout_remainder,
+       vector<size_t>& out_subset)
+{
+  out_subset.resize(0);
+  // Greedily moves file indices from inout_remainder into out_subset while the subset's efficiency stays above the bar.
+  if (required_efficiency_to_beat >= 1.0f) {
+    cerr << "can't beat efficiency 1." << endl;
+    abort();
+  }
+
+  while (!inout_remainder.empty()) {
+    // candidate_indices holds positions into inout_remainder; it starts out as all of them.
+    vector<size_t> candidate_indices(inout_remainder.size());
+    for (size_t i = 0; i < candidate_indices.size(); i++) {
+      candidate_indices[i] = i;
+    }
+    // Narrow the candidates by trying ever smaller groups of them together with out_subset.
+    size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
+    while (candidate_indices_subset_size >= 1) {
+      vector<size_t> candidate_indices_subset;
+      make_first_subset(candidate_indices_subset_size,
+                        candidate_indices_subset,
+                        candidate_indices.size());
+      // candidate_indices_subset holds positions into candidate_indices, not into inout_remainder.
+      vector<size_t> best_candidate_indices_subset;
+      float best_efficiency = 0.0f;
+      vector<size_t> trial_subset = out_subset;
+      trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
+      while (true)
+      {
+        for (size_t i = 0; i < candidate_indices_subset_size; i++) {
+          trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices[candidate_indices_subset[i]]];
+        }
+        // Evaluate out_subset extended by this group of candidates.
+        float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+        if (trial_efficiency > best_efficiency) {
+          best_efficiency = trial_efficiency;
+          best_candidate_indices_subset = candidate_indices_subset;
+        }
+        if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
+          break;
+        }
+        next_subset(candidate_indices_subset, candidate_indices.size());
+      }
+      // Keep only the best group when it beats the bar; positions only move towards the front, so this is safe in place.
+      if (best_efficiency > required_efficiency_to_beat) {
+        for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
+          candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
+        }
+        candidate_indices.resize(best_candidate_indices_subset.size());
+      }
+      candidate_indices_subset_size--;
+    }
+    // Commit the first surviving candidate only if it really keeps out_subset above the bar.
+    size_t candidate_index = candidate_indices[0];
+    auto candidate_iterator = inout_remainder.begin() + candidate_index;
+    vector<size_t> trial_subset = out_subset;
+
+    trial_subset.push_back(*candidate_iterator);
+    float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+    if (trial_efficiency > required_efficiency_to_beat) {
+      out_subset.push_back(*candidate_iterator);
+      inout_remainder.erase(candidate_iterator);
+    } else {
+      break;
+    }
+  }
+}
+
+void find_partition_with_efficiency_higher_than(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       float required_efficiency_to_beat,
+       vector<vector<size_t>>& out_partition)
+{
+  out_partition.clear();
+
+  // Start with every file index unassigned.
+  const size_t file_count = preprocessed_inputfiles.size();
+  vector<size_t> unassigned(file_count);
+  for (size_t index = 0; index < file_count; index++) unassigned[index] = index;
+
+  // Carve subsets out of the unassigned files until none are left.
+  while (!unassigned.empty()) {
+    out_partition.push_back(vector<size_t>());
+    find_subset_with_efficiency_higher_than(
+      preprocessed_inputfiles,
+      required_efficiency_to_beat,
+      unassigned,
+      out_partition.back());
+  }
+}
+
+void print_partition(
+       const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+       const vector<vector<size_t>>& partition)
+{
+  // Headline: subset count and overall efficiency of the partition.
+  const float overall = efficiency_of_partition(preprocessed_inputfiles, partition);
+  cout << "Partition into " << partition.size() << " subsets for " << overall * 100.0f << "% efficiency"  << endl;
+  for (size_t index = 0; index < partition.size(); index++) {
+    const vector<size_t>& subset = partition[index];
+    const float subset_efficiency = efficiency_of_subset(preprocessed_inputfiles, subset);
+    cout << "  Subset " << index << ", efficiency " << subset_efficiency * 100.0f << "%:" << endl;
+    for (size_t file_index : subset)
+      cout << "    " << preprocessed_inputfiles[file_index].filename << endl;
+    if (dump_tables) {
+      cout << "  Table:" << endl;
+      dump_table_for_subset(preprocessed_inputfiles, subset);
+    }
+  }
+  cout << endl;
+}
+
+struct action_t  // interface of a command-line action; the base implementations abort if they are ever called
+{
+  virtual const char* invokation_name() const { abort(); return nullptr; }  // name compared against argv in main()
+  virtual void run(const vector<string>&) const { abort(); }  // receives the input filenames collected by main()
+  virtual ~action_t() {}
+};
+
+struct partition_action_t : action_t  // the "partition" action: groups input files into subsets and prints the partitions found
+{
+  virtual const char* invokation_name() const override { return "partition"; }
+  virtual void run(const vector<string>& input_filenames) const override
+  {
+    vector<preprocessed_inputfile_t> preprocessed_inputfiles;
+
+    if (input_filenames.empty()) {
+      cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
+      exit(1);
+    }
+
+    for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
+      inputfile_t inputfile(*it);
+      switch (inputfile.type) {
+        case inputfile_t::type_t::all_pot_sizes:
+          preprocessed_inputfiles.emplace_back(inputfile);  // constructs a preprocessed_inputfile_t from the parsed file
+          break;
+        case inputfile_t::type_t::default_sizes:
+          cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
+               << "has no use for " << *it << " which contains measurements for default sizes." << endl;
+          exit(1);
+          break;
+        default:
+          cerr << "Unrecognized input file: " << *it << endl;
+          exit(1);
+      }
+    }
+
+    check_all_files_in_same_exact_order(preprocessed_inputfiles);
+
+    float required_efficiency_to_beat = 0.0f;  // the bar starts at zero and is raised after every partition found
+    vector<vector<vector<size_t>>> partitions;
+    cerr << "searching for partitions...\r" << flush;
+    while (true)
+    {
+      vector<vector<size_t>> partition;
+      find_partition_with_efficiency_higher_than(
+        preprocessed_inputfiles,
+        required_efficiency_to_beat,
+        partition);
+      float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+      cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
+           << " subsets for " << 100.0f * actual_efficiency
+           << " % efficiency"
+           << "                  \r" << flush;
+      partitions.push_back(partition);
+      if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {  // every file alone, or nothing left to gain
+        break;
+      }
+      required_efficiency_to_beat = actual_efficiency;  // the next partition has to beat the one just found
+    }
+    cerr << "                                                                  " << endl;
+    while (true) {  // repeatedly drop any partition that has at least as many subsets as the one following it
+      bool repeat = false;
+      for (size_t i = 0; i < partitions.size() - 1; i++) {
+        if (partitions[i].size() >= partitions[i+1].size()) {
+          partitions.erase(partitions.begin() + i);
+          repeat = true;
+          break;  // indices shifted after the erase: rescan from the start
+        }
+      }
+      if (!repeat) {
+        break;
+      }
+    }
+    for (auto it = partitions.begin(); it != partitions.end(); ++it) {
+      print_partition(preprocessed_inputfiles, *it);
+    }
+  }
+};
+
+struct evaluate_defaults_action_t : action_t  // the "evaluate-defaults" action: compares default block sizes against the best POT block size
+{
+  struct results_entry_t {
+    uint16_t product_size;             // compact product size shared by both measurements
+    size_triple_t default_block_size;  // block size reported in the default-sizes file
+    uint16_t best_pot_block_size;      // compact POT block size with the highest gflops
+    float default_gflops;
+    float best_pot_gflops;
+    float default_efficiency;          // default_gflops / best_pot_gflops
+  };
+  friend ostream& operator<<(ostream& s, const results_entry_t& entry)
+  {
+    return s
+      << "Product size " << size_triple_t(entry.product_size)
+      << ": default block size " << entry.default_block_size
+      << " -> " << entry.default_gflops
+      << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
+      << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
+      << " -> " << entry.best_pot_gflops
+      << " GFlop/s" << dec;
+  }
+  static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
+    return e1.default_efficiency < e2.default_efficiency;
+  }
+  virtual const char* invokation_name() const override { return "evaluate-defaults"; }
+  void show_usage_and_exit() const
+  {
+    cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
+    cerr << "checks how well the performance with default sizes compares to the best "
+         << "performance measured over all POT sizes." << endl;
+    exit(1);
+  }
+  virtual void run(const vector<string>& input_filenames) const override
+  {
+    if (input_filenames.size() != 2) {
+      show_usage_and_exit();
+    }
+    inputfile_t inputfile_default_sizes(input_filenames[0]);
+    inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
+    if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
+      cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
+      show_usage_and_exit();
+    }
+    if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
+      cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
+      show_usage_and_exit();
+    }
+    vector<results_entry_t> results;
+    vector<results_entry_t> cubic_results;
+    // it_all_pot_sizes only ever moves forward, so both files must list product sizes in the same order.
+    uint16_t product_size = 0;
+    auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
+    for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
+         it_default_sizes != inputfile_default_sizes.entries.end();
+         ++it_default_sizes)
+    {
+      if (it_default_sizes->product_size == product_size) {
+        continue;  // only the first default-sizes entry of each product size is used
+      }
+      product_size = it_default_sizes->product_size;
+      while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
+             it_all_pot_sizes->product_size != product_size)
+      {
+        ++it_all_pot_sizes;
+      }
+      if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
+        break;  // no POT measurements left for this or any later product size
+      }
+      uint16_t best_pot_block_size = 0;
+      float best_pot_gflops = 0;
+      for (auto it = it_all_pot_sizes;
+           it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
+           ++it)
+      {
+        if (it->gflops > best_pot_gflops) {
+          best_pot_gflops = it->gflops;
+          best_pot_block_size = it->pot_block_size;
+        }
+      }
+      results_entry_t entry;
+      entry.product_size = product_size;
+      entry.default_block_size = it_default_sizes->nonpot_block_size;
+      entry.best_pot_block_size = best_pot_block_size;
+      entry.default_gflops = it_default_sizes->gflops;
+      entry.best_pot_gflops = best_pot_gflops;
+      entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
+      results.push_back(entry);
+
+      size_triple_t t(product_size);
+      if (t.k == t.m && t.m == t.n) {
+        cubic_results.push_back(entry);
+      }
+    }
+
+    cout << "All results:" << endl;
+    for (auto it = results.begin(); it != results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(results.begin(), results.end(), lower_efficiency);  // ascending: the worst default efficiency comes first
+
+    const size_t n = min<size_t>(20, results.size());
+    cout << n << " worst results:" << endl;
+    for (size_t i = 0; i < n; i++) {
+      cout << results[i] << endl;
+    }
+    cout << endl;
+
+    cout << "cubic results:" << endl;
+    for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
+    if (results.empty()) return;  // no product size matched: the summary below would index results out of range
+    cout.precision(2);
+    vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
+    for (auto it = a.begin(); it != a.end(); ++it) {
+      size_t n = min(results.size() - 1, size_t(*it * results.size()));  // index of the entry at this fraction of the sorted results
+      cout << (100.0f * n / max<size_t>(1, results.size() - 1))  // the max() avoids 0/0 when there is a single result
+           << " % of product sizes have default efficiency <= "
+           << 100.0f * results[n].default_efficiency << " %" << endl;
+    }
+    cout.precision(default_precision);
+  }
+};
+
+
+void show_usage_and_exit(int argc, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  // Synopsis first, then one line per registered action name.
+  cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
+  cerr << "available actions:" << endl;
+  for (const unique_ptr<action_t>& action : available_actions)
+    cerr << "  " << action->invokation_name() << endl;
+  cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
+  exit(1);
+}
+
+int main(int argc, char* argv[])  // parses <action> [options...] <input files...> and runs the selected action
+{
+  cout.precision(default_precision);
+  cerr.precision(default_precision);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new partition_action_t);
+  available_actions.emplace_back(new evaluate_defaults_action_t);
+
+  vector<string> input_filenames;
+
+  action_t* action = nullptr;  // non-owning; points into available_actions once an action name was matched
+
+  if (argc < 2) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (int i = 1; i < argc; i++) {
+    bool arg_handled = false;
+    // Step 1. Try to match action invokation names.
+    for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+      if (!strcmp(argv[i], (*it)->invokation_name())) {
+        if (!action) {
+          action = it->get();
+          arg_handled = true;
+          break;
+        } else {
+          cerr << "can't specify more than one action!" << endl;
+          show_usage_and_exit(argc, argv, available_actions);
+        }
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 2. Try to match option names.
+    if (argv[i][0] == '-') {
+      if (!strcmp(argv[i], "--only-cubic-sizes")) {
+        only_cubic_sizes = true;
+        arg_handled = true;
+      }
+      if (!strcmp(argv[i], "--dump-tables")) {
+        dump_tables = true;
+        arg_handled = true;
+      }
+      if (!arg_handled) {  // anything else starting with '-' is rejected rather than treated as a filename
+        cerr << "Unrecognized option: " << argv[i] << endl;
+        show_usage_and_exit(argc, argv, available_actions);
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 3. Default to interpreting args as input filenames.
+    input_filenames.emplace_back(argv[i]);
+  }
+
+  if (dump_tables && only_cubic_sizes) {
+    cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  if (!action) {  // an action name is mandatory
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  action->run(input_filenames);
+}
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 8222271fb..0974ebe4c 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -148,7 +148,7 @@ int main(int argc, char ** argv)
   int m = s;
   int n = s;
   int p = s;
-  int cache_size = -1;
+  int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
 
   bool need_help = false;
   for (int i=1; i<argc;)
@@ -169,7 +169,13 @@ int main(int argc, char ** argv)
       else if(argv[i][1]=='c')
       {
         ++i;
-        cache_size = atoi(argv[i++]);
+        cache_size1 = atoi(argv[i++]);
+        if(argv[i][0]!='-')
+        {
+          cache_size2 = atoi(argv[i++]);
+          if(argv[i][0]!='-')
+            cache_size3 = atoi(argv[i++]);
+        }
       }
       else if(argv[i][1]=='t')
       {
@@ -191,14 +197,14 @@ int main(int argc, char ** argv)
 
   if(need_help)
   {
-    std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
+    std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
     std::cout << "   <matrix sizes> : size\n";
     std::cout << "   <matrix sizes> : rows columns depth\n";
     return 1;
   }
 
-  if(cache_size>0)
-    setCpuCacheSizes(cache_size,96*cache_size);
+ if(cache_size1>0)
+   setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
 
   
   A a(m,p); a.setRandom();
diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp
index 398fef835..129afcfb2 100644
--- a/bench/bench_norm.cpp
+++ b/bench/bench_norm.cpp
@@ -6,19 +6,25 @@ using namespace Eigen;
 using namespace std;
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v)
 {
   return v.norm();
 }
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v)
+{
+  return v.stableNorm();
+}
+
+template<typename T>
+EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v)
 {
   return v.hypotNorm();
 }
 
 template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v)
 {
   return v.blueNorm();
 }
@@ -217,20 +223,21 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
 }
 
 #define BENCH_PERF(NRM) { \
+  float af = 0; double ad = 0; std::complex<float> ac = 0; \
   Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
   for (int k=0; k<tries; ++k) { \
     tf.start(); \
-    for (int i=0; i<iters; ++i) NRM(vf); \
+    for (int i=0; i<iters; ++i) { af += NRM(vf); } \
     tf.stop(); \
   } \
   for (int k=0; k<tries; ++k) { \
     td.start(); \
-    for (int i=0; i<iters; ++i) NRM(vd); \
+    for (int i=0; i<iters; ++i) { ad += NRM(vd); } \
     td.stop(); \
   } \
   /*for (int k=0; k<std::max(1,tries/3); ++k) { \
     tcf.start(); \
-    for (int i=0; i<iters; ++i) NRM(vcf); \
+    for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \
     tcf.stop(); \
   } */\
   std::cout << #NRM << "\t" << tf.value() << "   " << td.value() <<  "    " << tcf.value() << "\n"; \
@@ -316,14 +323,17 @@ int main(int argc, char** argv)
     std::cout << "\n";
   }
 
+  y = 1;
   std::cout.precision(4);
-  std::cerr << "Performance (out of cache):\n";
+  int s1 = 1024*1024*32;
+  std::cerr << "Performance (out of cache, " << s1 << "):\n";
   {
     int iters = 1;
-    VectorXf vf = VectorXf::Random(1024*1024*32) * y;
-    VectorXd vd = VectorXd::Random(1024*1024*32) * y;
-    VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y;
+    VectorXf vf = VectorXf::Random(s1) * y;
+    VectorXd vd = VectorXd::Random(s1) * y;
+    VectorXcf vcf = VectorXcf::Random(s1) * y;
     BENCH_PERF(sqsumNorm);
+    BENCH_PERF(stableNorm);
     BENCH_PERF(blueNorm);
     BENCH_PERF(pblueNorm);
     BENCH_PERF(lapackNorm);
@@ -332,13 +342,14 @@ int main(int argc, char** argv)
     BENCH_PERF(bl2passNorm);
   }
 
-  std::cerr << "\nPerformance (in cache):\n";
+  std::cerr << "\nPerformance (in cache, " << 512 << "):\n";
   {
     int iters = 100000;
     VectorXf vf = VectorXf::Random(512) * y;
     VectorXd vd = VectorXd::Random(512) * y;
     VectorXcf vcf = VectorXcf::Random(512) * y;
     BENCH_PERF(sqsumNorm);
+    BENCH_PERF(stableNorm);
     BENCH_PERF(blueNorm);
     BENCH_PERF(pblueNorm);
     BENCH_PERF(lapackNorm);
diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp
new file mode 100644
index 000000000..827be2880
--- /dev/null
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -0,0 +1,677 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <fstream>
+#include <memory>
+#include <cstdio>
+
+bool eigen_use_specific_block_size;
+int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
+#include <Eigen/Core>
+
+#include <bench/BenchTimer.h>
+
+using namespace Eigen;
+using namespace std;
+
+static BenchTimer timer;
+
+// how many times we repeat each measurement.
+// measurements are randomly shuffled - we're not doing
+// all N identical measurements in a row.
+const int measurement_repetitions = 3;
+
+// Timings below this value are too short to be accurate,
+// we'll repeat measurements with more iterations until
+// we get a timing above that threshold.
+const float min_accurate_time = 1e-2f;
+
+// See --min-working-set-size command line parameter.
+size_t min_working_set_size = 0;
+
+float max_clock_speed = 0.0f;
+
+// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
+const size_t maxsize = 2048;
+const size_t minsize = 16;
+
+typedef MatrixXf MatrixType;
+typedef MatrixType::Scalar Scalar;
+typedef internal::packet_traits<Scalar>::type Packet;
+
+static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
+static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
+static_assert(maxsize > minsize, "maxsize must be larger than minsize");
+static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+  size_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  size_triple_t(uint16_t compact)
+  {
+    k = 1 << ((compact & 0xf00) >> 8);
+    m = 1 << ((compact & 0x0f0) >> 4);
+    n = 1 << ((compact & 0x00f) >> 0);
+  }
+};
+
+uint8_t log2_pot(size_t x) {
+  size_t l = 0;
+  while (x >>= 1) l++;
+  return l;
+}
+
+// Convert between size triples and a compact form fitting in 12 bits
+// where each size, which must be a POT, is encoded as its log2, on 4 bits
+// so the largest representable size is 2^15 == 32k  ... big enough.
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+uint16_t compact_size_triple(const size_triple_t& t)
+{
+  return compact_size_triple(t.k, t.m, t.n);
+}
+
+// A single benchmark. Initially only contains benchmark params.
+// Then call run(), which stores the result in the gflops field.
+struct benchmark_t
+{
+  uint16_t compact_product_size;
+  uint16_t compact_block_size;
+  bool use_default_block_size;
+  float gflops;
+  benchmark_t()
+    : compact_product_size(0)
+    , compact_block_size(0)
+    , use_default_block_size(false)
+    , gflops(0)
+  {
+  }
+  benchmark_t(size_t pk, size_t pm, size_t pn,
+              size_t bk, size_t bm, size_t bn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(compact_size_triple(bk, bm, bn))
+    , use_default_block_size(false)
+    , gflops(0)
+  {}
+  benchmark_t(size_t pk, size_t pm, size_t pn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(0)
+    , use_default_block_size(true)
+    , gflops(0)
+  {}
+
+  void run();
+};
+
+ostream& operator<<(ostream& s, const benchmark_t& b)
+{
+  s << hex << b.compact_product_size << dec;
+  if (b.use_default_block_size) {
+    size_triple_t t(b.compact_product_size);
+    Index k = t.k, m = t.m, n = t.n;
+    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
+    s << " default(" << k << ", " << m << ", " << n << ")";
+  } else {
+    s << " " << hex << b.compact_block_size << dec;
+  }
+  s << " " << b.gflops;
+  return s;
+}
+
+// We sort first by increasing benchmark parameters,
+// then by decreasing performance.
+bool operator<(const benchmark_t& b1, const benchmark_t& b2)
+{ 
+  return b1.compact_product_size < b2.compact_product_size ||
+           (b1.compact_product_size == b2.compact_product_size && (
+             (b1.compact_block_size < b2.compact_block_size || (
+               b1.compact_block_size == b2.compact_block_size &&
+                 b1.gflops > b2.gflops))));
+}
+
+void benchmark_t::run()
+{
+  size_triple_t productsizes(compact_product_size);
+
+  if (use_default_block_size) {
+    eigen_use_specific_block_size = false;
+  } else {
+    // feed eigen with our custom blocking params
+    eigen_use_specific_block_size = true;
+    size_triple_t blocksizes(compact_block_size);
+    eigen_block_size_k = blocksizes.k;
+    eigen_block_size_m = blocksizes.m;
+    eigen_block_size_n = blocksizes.n;
+  }
+
+  // set up the matrix pool
+
+  const size_t combined_three_matrices_sizes =
+    sizeof(Scalar) *
+      (productsizes.k * productsizes.m +
+       productsizes.k * productsizes.n +
+       productsizes.m * productsizes.n);
+
+  // 64 M is large enough that nobody has a cache bigger than that,
+  // while still being small enough that everybody has this much RAM,
+  // so conveniently we don't need to special-case platforms here.
+  const size_t unlikely_large_cache_size = 64 << 20;
+
+  const size_t working_set_size =
+    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
+
+  const size_t matrix_pool_size =
+    1 + working_set_size / combined_three_matrices_sizes;
+
+  MatrixType *lhs = new MatrixType[matrix_pool_size];
+  MatrixType *rhs = new MatrixType[matrix_pool_size];
+  MatrixType *dst = new MatrixType[matrix_pool_size];
+  
+  for (size_t i = 0; i < matrix_pool_size; i++) {
+    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
+    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
+    dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
+  }
+
+  // main benchmark loop
+
+  int iters_at_a_time = 1;
+  float time_per_iter = 0.0f;
+  size_t matrix_index = 0;
+  while (true) {
+
+    double starttime = timer.getCpuTime();
+    for (int i = 0; i < iters_at_a_time; i++) {
+      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
+      matrix_index++;
+      if (matrix_index == matrix_pool_size) {
+        matrix_index = 0;
+      }
+    }
+    double endtime = timer.getCpuTime();
+
+    const float timing = float(endtime - starttime);
+
+    if (timing >= min_accurate_time) {
+      time_per_iter = timing / iters_at_a_time;
+      break;
+    }
+
+    iters_at_a_time *= 2;
+  }
+
+  delete[] lhs;
+  delete[] rhs;
+  delete[] dst;
+
+  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
+}
+
+void print_cpuinfo()
+{
+#ifdef __linux__
+  cout << "contents of /proc/cpuinfo:" << endl;
+  string line;
+  ifstream cpuinfo("/proc/cpuinfo");
+  if (cpuinfo.is_open()) {
+    while (getline(cpuinfo, line)) {
+      cout << line << endl;
+    }
+    cpuinfo.close();
+  }
+  cout << endl;
+#elif defined __APPLE__
+  cout << "output of sysctl hw:" << endl;
+  system("sysctl hw");
+  cout << endl;
+#endif
+}
+
+template <typename T>
+string type_name()
+{
+  return "unknown";
+}
+
+template<>
+string type_name<float>()
+{
+  return "float";
+}
+
+template<>
+string type_name<double>()
+{
+  return "double";
+}
+
+struct action_t
+{
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run() const { abort(); }
+  virtual ~action_t() {}
+};
+
+void show_usage_and_exit(int /*argc*/, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
+  cerr << "available actions:" << endl << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  }
+  cerr << endl;
+  cerr << "options:" << endl << endl;
+  cerr << "  --min-working-set-size=N:" << endl;
+  cerr << "       Set the minimum working set size to N bytes." << endl;
+  cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
+  cerr << "       A larger working set lowers the chance of a warm cache." << endl;
+  cerr << "       The default value 0 means use a large enough working" << endl;
+  cerr << "       set to likely outsize caches." << endl;
+  cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
+  cerr << "       avoid warm caches." << endl;
+  exit(1);
+}
+     
+float measure_clock_speed()
+{
+  cerr << "Measuring clock speed...                              \r" << flush;
+          
+  vector<float> all_gflops;
+  for (int i = 0; i < 8; i++) {
+    benchmark_t b(1024, 1024, 1024);
+    b.run();
+    all_gflops.push_back(b.gflops);
+  }
+
+  sort(all_gflops.begin(), all_gflops.end());
+  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
+
+  // multiply by an arbitrary constant to discourage doing anything with the
+  // returned values besides just comparing them with each other.
+  float result = stable_estimate * 123.456f;
+
+  return result;
+}
+
+// A duration in whole seconds; operator<< below prints it in hours/minutes/seconds.
+struct human_duration_t
+{
+  int seconds;
+  human_duration_t(int s) : seconds(s) {}
+};
+
+// Prints "H h M min S s", omitting zero-valued leading units; seconds only if total < 600 s.
+ostream& operator<<(ostream& s, const human_duration_t& d)
+{
+  int remainder = d.seconds;
+  // >= rather than >: exactly 3600 s must print "1 h ", not "60 min ".
+  if (remainder >= 3600) {
+    s << remainder / 3600 << " h ";
+    remainder %= 3600;
+  }
+  // Same boundary rule: exactly 60 s prints "1 min 0 s", not "60 s".
+  if (remainder >= 60) {
+    s << remainder / 60 << " min ";
+    remainder %= 60;
+  }
+  if (d.seconds < 600) s << remainder << " s";
+  return s;
+}
+
+const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
+
+void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "w");
+  if (!file) {
+    cerr << "Could not open file " << filename << " for writing." << endl;
+    cerr << "Do you have write permissions on the current working directory?" << endl;
+    exit(1);
+  }
+  size_t benchmarks_vector_size = benchmarks.size();
+  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
+  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
+  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
+  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
+  fclose(file);
+}
+
+// Restores a session saved by serialize_benchmarks(): max_clock_speed, the resume index and all benchmarks.
+// Returns true and deletes the file on success, false if it cannot be opened or is too short.
+bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "r");
+  if (!file) {
+    return false;
+  }
+  size_t benchmarks_vector_size = 0;
+  // Header fields in the order they were written; && stops at the first short read.
+  bool ok = 1 == fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)
+         && 1 == fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)
+         && 1 == fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
+  if (ok) {
+    benchmarks.resize(benchmarks_vector_size);
+    ok = benchmarks.size() == fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
+  }
+  // Close on every path: the handle used to be leaked, on success and on failure alike.
+  fclose(file);
+  // The session has been consumed, so remove the file.
+  if (ok) unlink(filename);
+  return ok;
+}
+
+// Runs benchmarks from first_benchmark_to_run on, advancing it only past results whose clock speed was verified.
+void try_run_some_benchmarks(
+  vector<benchmark_t>& benchmarks, double time_start,
+  size_t& first_benchmark_to_run)
+{
+  if (first_benchmark_to_run == benchmarks.size()) {
+    return;
+  }
+
+  double time_last_progress_update = 0;
+  double time_last_clock_speed_measurement = 0;
+  double time_now = 0;
+
+  size_t benchmark_index = first_benchmark_to_run;
+
+  while (true) {
+    float ratio_done = float(benchmark_index) / benchmarks.size();
+    time_now = timer.getRealTime();
+
+    // We check clock speed every minute and at the end.
+    if (benchmark_index == benchmarks.size() ||
+        time_now > time_last_clock_speed_measurement + 60.0f)
+    {
+      time_last_clock_speed_measurement = time_now;
+
+      // Ensure that clock speed is as expected
+      float current_clock_speed = measure_clock_speed();
+
+      // The tolerance needs to be smaller than the relative difference between
+      // clock speeds that a device could operate under.
+      // It seems unlikely that a device would be throttling clock speeds by
+      // amounts smaller than 2%.
+      // With a value of 1%, I was getting within noise on a Sandy Bridge.
+      const float clock_speed_tolerance = 0.02f;
+
+      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
+        // Clock speed is now higher than we previously measured.
+        // Either our initial measurement was inaccurate, which won't happen
+        // too many times as we are keeping the best clock speed value and
+        // allowing some tolerance; or something really weird happened,
+        // which invalidates all benchmark results collected so far.
+        // Either way, we better restart all over again now.
+        if (benchmark_index) {
+          cerr << "Restarting at " << 100.0f * ratio_done
+               << " % because clock speed increased.          " << endl;
+        }
+        max_clock_speed = current_clock_speed;
+        first_benchmark_to_run = 0;
+        return;
+      }
+
+      bool rerun_last_tests = false;
+
+      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+        cerr << "Measurements completed so far: "
+             << 100.0f * ratio_done
+             << " %                             " << endl;
+        cerr << "Clock speed seems to be only "
+             << current_clock_speed/max_clock_speed
+             << " times what it used to be." << endl;
+
+        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
+
+        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+          if (seconds_to_sleep_if_lower_clock_speed > 32) {
+            cerr << "Sleeping longer probably won't make a difference." << endl;
+            cerr << "Serializing benchmarks to " << session_filename << endl;
+            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
+            cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
+            exit(2);
+          }
+          rerun_last_tests = true;
+          cerr << "Sleeping "
+               << seconds_to_sleep_if_lower_clock_speed
+               << " s...                                   \r" << endl;
+          sleep(seconds_to_sleep_if_lower_clock_speed);
+          current_clock_speed = measure_clock_speed();
+          seconds_to_sleep_if_lower_clock_speed *= 2;
+        }
+      }
+
+      if (rerun_last_tests) {
+        cerr << "Redoing the last "
+             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
+             << " % because clock speed had been low.   " << endl;
+        return;
+      }
+
+      // nothing wrong with the clock speed so far, so there won't be a need to rerun
+      // benchmarks run so far in case we later encounter a lower clock speed.
+      first_benchmark_to_run = benchmark_index;
+    }
+
+    if (benchmark_index == benchmarks.size()) {
+      // We're done!
+      first_benchmark_to_run = benchmarks.size();
+      // Erase progress info
+      cerr << "                                                            " << endl;
+      return;
+    }
+
+    // Display progress info on stderr (not at 0 %: the ETA below divides by ratio_done)
+    if (ratio_done > 0.0f && time_now > time_last_progress_update + 1.0f) {
+      time_last_progress_update = time_now;
+      cerr << "Measurements... " << 100.0f * ratio_done
+           << " %, ETA "
+           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
+           << "                          \r" << flush;
+    }
+
+    // This is where we actually run a benchmark!
+    benchmarks[benchmark_index].run();
+    benchmark_index++;
+  }
+}
+
+// Runs every benchmark (resuming a saved session when it matches) and keeps only the best result per parameter set.
+void run_benchmarks(vector<benchmark_t>& benchmarks)
+{
+  size_t first_benchmark_to_run;
+  vector<benchmark_t> deserialized_benchmarks;
+  bool use_deserialized_benchmarks = false;
+  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
+    cerr << "Found serialized session with "
+         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
+         << " % already done" << endl;
+    if (deserialized_benchmarks.size() == benchmarks.size() &&
+        first_benchmark_to_run > 0 &&
+        first_benchmark_to_run < benchmarks.size())
+    {
+      use_deserialized_benchmarks = true;
+    }
+  }
+
+  if (use_deserialized_benchmarks) {
+    benchmarks = deserialized_benchmarks;
+  } else {
+    // not using deserialized benchmarks, starting from scratch
+    first_benchmark_to_run = 0;
+
+    // Randomly shuffling benchmarks allows us to get accurate enough progress info,
+    // as now the cheap/expensive benchmarks are randomly mixed so they average out.
+    // It also means that if data is corrupted for some time span, the odds are that
+    // not all repetitions of a given benchmark will be corrupted.
+    random_shuffle(benchmarks.begin(), benchmarks.end());
+  }
+
+  for (int i = 0; i < 4; i++) {
+    max_clock_speed = max(max_clock_speed, measure_clock_speed());
+  }
+  
+  // Start timing now too: a resumed session never passes through index 0 below.
+  double time_start = timer.getRealTime();
+  while (first_benchmark_to_run < benchmarks.size()) {
+    if (first_benchmark_to_run == 0) {
+      time_start = timer.getRealTime();
+    }
+    try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
+  }
+
+  // Sort timings by increasing benchmark parameters, and decreasing gflops.
+  // The latter is very important. It means that we can ignore all but the first
+  // benchmark with given parameters.
+  sort(benchmarks.begin(), benchmarks.end());
+
+  // Collect best (i.e. now first) results for each parameter values.
+  vector<benchmark_t> best_benchmarks;
+  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+    if (best_benchmarks.empty() ||
+        best_benchmarks.back().compact_product_size != it->compact_product_size ||
+        best_benchmarks.back().compact_block_size != it->compact_block_size)
+    {
+      best_benchmarks.push_back(*it);
+    }
+  }
+
+  // keep and return only the best benchmarks
+  benchmarks = best_benchmarks;
+}
+
+struct measure_all_pot_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "all-pot-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
+              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
+                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
+                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+struct measure_default_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "default-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            benchmarks.emplace_back(ksize, msize, nsize);
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+int main(int argc, char* argv[])
+{
+  double time_start = timer.getRealTime();
+  cout.precision(4);
+  cerr.precision(4);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
+  available_actions.emplace_back(new measure_default_sizes_action_t);
+
+  auto action = available_actions.end();
+
+  if (argc <= 1) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    if (!strcmp(argv[1], (*it)->invokation_name())) {
+      action = it;
+      break;
+    }
+  }
+
+  if (action == available_actions.end()) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  for (int i = 2; i < argc; i++) {
+    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
+      const char* equals_sign = strchr(argv[i], '=');
+      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+    } else {
+      cerr << "unrecognized option: " << argv[i] << endl << endl;
+      show_usage_and_exit(argc, argv, available_actions);
+    }
+  }
+
+  print_cpuinfo();
+
+  cout << "benchmark parameters:" << endl;
+  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+  cout << "scalar type: " << type_name<Scalar>() << endl;
+  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
+  cout << "minsize = " << minsize << endl;
+  cout << "maxsize = " << maxsize << endl;
+  cout << "measurement_repetitions = " << measurement_repetitions << endl;
+  cout << "min_accurate_time = " << min_accurate_time << endl;
+  cout << "min_working_set_size = " << min_working_set_size;
+  if (min_working_set_size == 0) {
+    cout << " (try to outsize caches)";
+  }
+  cout << endl << endl;
+
+  (*action)->run();
+
+  double time_end = timer.getRealTime();
+  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
+}
diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt
index b299d9899..9444b450c 100644
--- a/bench/btl/CMakeLists.txt
+++ b/bench/btl/CMakeLists.txt
@@ -97,6 +97,7 @@ ENABLE_TESTING()
 
 add_subdirectory(libs/eigen3)
 add_subdirectory(libs/eigen2)
+add_subdirectory(libs/tensors)
 add_subdirectory(libs/BLAS)
 add_subdirectory(libs/ublas)
 add_subdirectory(libs/gmm)
diff --git a/bench/btl/cmake/FindACML.cmake b/bench/btl/cmake/FindACML.cmake
index f45ae1b0d..4989fa2f4 100644
--- a/bench/btl/cmake/FindACML.cmake
+++ b/bench/btl/cmake/FindACML.cmake
@@ -17,6 +17,7 @@ find_file(ACML_LIBRARIES
   libacml_mp.so
   PATHS
   /usr/lib
+  /usr/lib64
   $ENV{ACMLDIR}/lib
   ${LIB_INSTALL_DIR}
 )
@@ -35,6 +36,7 @@ if(NOT ACML_LIBRARIES)
         libacml.so libacml_mv.so
         PATHS
         /usr/lib
+        /usr/lib64
         $ENV{ACMLDIR}/lib
         ${LIB_INSTALL_DIR}
         )
diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake
index 14b1dee09..4136a989d 100644
--- a/bench/btl/cmake/FindATLAS.cmake
+++ b/bench/btl/cmake/FindATLAS.cmake
@@ -3,18 +3,13 @@ if (ATLAS_LIBRARIES)
   set(ATLAS_FIND_QUIETLY TRUE)
 endif (ATLAS_LIBRARIES)
 
-find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LAPACK NAMES liblapack_atlas.so.3 liblapack.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LAPACK NAMES lapack_atlas lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-if(NOT ATLAS_LAPACK)
-  find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-  find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-endif(NOT ATLAS_LAPACK)
-
-find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
 if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
diff --git a/bench/btl/cmake/FindCBLAS.cmake b/bench/btl/cmake/FindCBLAS.cmake
index 554f0291b..ce0f2f2b2 100644
--- a/bench/btl/cmake/FindCBLAS.cmake
+++ b/bench/btl/cmake/FindCBLAS.cmake
@@ -23,6 +23,7 @@ find_file(CBLAS_LIBRARIES
   libcblas.so.3
   PATHS
   /usr/lib
+  /usr/lib64
   $ENV{CBLASDIR}/lib
   ${LIB_INSTALL_DIR}
 )
diff --git a/bench/btl/cmake/FindOPENBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake
index c76fc251c..2a0919436 100644
--- a/bench/btl/cmake/FindOPENBLAS.cmake
+++ b/bench/btl/cmake/FindOPENBLAS.cmake
@@ -3,7 +3,7 @@ if (OPENBLAS_LIBRARIES)
   set(OPENBLAS_FIND_QUIETLY TRUE)
 endif (OPENBLAS_LIBRARIES)
 
-find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
 find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
 
 if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh
index 47fe58135..1deabdae2 100644
--- a/bench/btl/libs/eigen2/eigen2_interface.hh
+++ b/bench/btl/libs/eigen2/eigen2_interface.hh
@@ -47,7 +47,7 @@ public :
   {
     #if defined(EIGEN_VECTORIZE_SSE)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
-    #elif defined(EIGEN_VECTORIZE_ALTIVEC)
+    #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
     #else
     if (SIZE==Dynamic) return "eigen2_novec"; else return "tiny_eigen2_novec";
diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt
new file mode 100644
index 000000000..09d6d8e43
--- /dev/null
+++ b/bench/btl/libs/tensors/CMakeLists.txt
@@ -0,0 +1,44 @@
+
+
+if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
+  # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
+  set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
+  set(TENSOR_FOUND TRUE)
+else()
+  find_package(Tensor)
+endif()
+
+if (TENSOR_FOUND)
+
+  include_directories(${TENSOR_INCLUDE_DIR})
+  btl_add_bench(btl_tensor_linear main_linear.cpp)
+  btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
+  btl_add_bench(btl_tensor_matmat main_matmat.cpp)
+
+  btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+  btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+  btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+
+  option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
+  if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
+    btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
+    btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
+    btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
+
+    btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+    btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+    btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+  endif()
+
+
+  if(NOT BTL_NOVEC)
+    btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
+    btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
+    btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
+    btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+    btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+    btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+
+  endif(NOT BTL_NOVEC)
+
+endif (TENSOR_FOUND)
diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp
new file mode 100644
index 000000000..e257f1e72
--- /dev/null
+++ b/bench/btl/libs/tensors/main_linear.cpp
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  typedef tensor_interface<REAL_TYPE> Interface;  // backend under test
+  bench<Action_axpy<Interface> >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  bench<Action_axpby<Interface> >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp
new file mode 100644
index 000000000..675fcfc6d
--- /dev/null
+++ b/bench/btl/libs/tensors/main_matmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  typedef tensor_interface<REAL_TYPE> Interface;  // backend under test
+  bench<Action_matrix_matrix_product<Interface> >(MIN_MM,MAX_MM,NB_POINT);
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp
new file mode 100644
index 000000000..1af00c81b
--- /dev/null
+++ b/bench/btl/libs/tensors/main_vecmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  typedef tensor_interface<REAL_TYPE> Interface;  // backend under test
+  bench<Action_matrix_vector_product<Interface> >(MIN_MV,MAX_MV,NB_POINT);
+  return 0;
+}
diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh
new file mode 100644
index 000000000..97b8e0f0b
--- /dev/null
+++ b/bench/btl/libs/tensors/tensor_interface.hh
@@ -0,0 +1,105 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#ifndef TENSOR_INTERFACE_HH
+#define TENSOR_INTERFACE_HH
+
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <vector>
+#include "btl.hh"
+
+using namespace Eigen;
+// BTL adapter that exposes rank-1 and rank-2 Eigen::Tensor objects through the static functions used by the benchmark actions.
+template<class real>
+class tensor_interface
+{
+public :
+  typedef real real_type;
+  typedef typename Eigen::Tensor<real,2>::Index Index;
+  // STL containers used to exchange data with the tensors; stl_matrix[j][i] maps to tensor element (i,j).
+  typedef std::vector<real> stl_vector;
+  typedef std::vector<stl_vector> stl_matrix;
+  // Tensor types standing in for the generic matrix and vector.
+  typedef Eigen::Tensor<real,2> gene_matrix;
+  typedef Eigen::Tensor<real,1> gene_vector;
+
+  // Label of this backend: the stringified BTL_PREFIX macro.
+  static inline std::string name( void )
+  {
+    return EIGEN_MAKESTRING(BTL_PREFIX);
+  }
+  // No-ops: nothing is released explicitly for tensor objects.
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
+
+  static void free_vector(gene_vector & /*B*/) {}
+  // Resizes A to (A_stl[0].size() x A_stl.size()) and copies A_stl[j][i] into A(i,j); an empty A_stl gives a 0 x 0 tensor.
+  static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+    A.resize(Eigen::array<Index,2>(A_stl.empty() ? 0 : A_stl[0].size(), A_stl.size()));
+
+    for (unsigned int j=0; j<A_stl.size() ; j++){
+      for (unsigned int i=0; i<A_stl[j].size() ; i++){
+        A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
+      }
+    }
+  }
+  // Resizes B to B_stl.size() and copies every element of B_stl into it.
+  static BTL_DONT_INLINE  void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+    B.resize(B_stl.size());
+
+    for (unsigned int i=0; i<B_stl.size() ; i++){
+      B.coeffRef(i) = B_stl[i];
+    }
+  }
+  // Copies the first B_stl.size() coefficients of B into B_stl; B_stl is not resized.
+  static BTL_DONT_INLINE  void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+    for (unsigned int i=0; i<B_stl.size() ; i++){
+      B_stl[i] = B.coeff(i);
+    }
+  }
+  // Copies the leading N x N part of A into A_stl, with N = A_stl.size(); every inner vector is resized to N.
+  static BTL_DONT_INLINE  void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+    int  N=A_stl.size();
+
+    for (int j=0;j<N;j++){
+      A_stl[j].resize(N);
+      for (int i=0;i<N;i++){
+        A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
+      }
+    }
+  }
+  // X = A * B, written as a contraction of dimension 1 of A with dimension 0 of B.
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int  /*N*/){
+    typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+    X/*.noalias()*/ = A.contract(B, dims);
+  }
+  // X = A * B for a rank-1 B, using the same (1, 0) contraction pair.
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int  /*N*/){
+    typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+    X/*.noalias()*/ = A.contract(B, dims);
+  }
+  // Y += coef * X
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int  /*N*/){
+    Y += X.constant(coef) * X;
+  }
+  // Y = a * X + b * Y
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int  /*N*/){
+    Y = X.constant(a)*X + Y.constant(b)*Y;
+  }
+  // Plain assignments; "cible" (French for "target") is the destination.
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int  /*N*/){
+    cible = source;
+  }
+
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int  /*N*/){
+    cible = source;
+  }
+};
+
+#endif
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
new file mode 100644
index 000000000..40a71c781
--- /dev/null
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -0,0 +1,45 @@
+#3.0.1
+#3.1.1
+#3.2.0
+3.2.4
+#5745:37f59e65eb6c
+5891:d8652709345d  # introduce AVX
+#5893:24b4dc92c6d3  # merge
+5895:997c2ef9fc8b  # introduce FMA
+#5904:e1eafd14eaa1  # complex and AVX
+5908:f8ee3c721251  # improve packing with ptranspose
+#5921:ca808bb456b0  # merge
+#5927:8b1001f9e3ac
+5937:5a4ca1ad8c53  # New gebp kernel handling up to 3 packets x 4 register-level blocks
+#5949:f3488f4e45b2  # merge
+#5969:e09031dccfd9  # Disable 3pX4 kernel on Altivec
+#5992:4a429f5e0483  # merge
+before-evaluators
+#6334:f6a45e5b8b7c  # Implement evaluator for sparse outer products
+#6639:c9121c60b5c7
+#6655:06f163b5221f  # Properly detect FMA support on ARM
+#6677:700e023044e7   # FMA has been wrongly disabled
+#6681:11d31dafb0e3
+#6699:5e6e8e10aad1   # merge default to tensors
+#6726:ff2d2388e7b9   # merge default to tensors
+#6742:0cbd6195e829   # merge default to tensors
+#6747:853d2bafeb8f   # Generalized the gebp apis
+6765:71584fd55762   # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
+#6781:9cc5a931b2c6   # generalized gemv
+#6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
+#6844:039efd86b75c   # merge tensor
+6845:7333ed40c6ef   # change prefetching in gebp
+#6856:b5be5e10eb7f   # merge index conversion
+#6893:c3a64aba7c70   # clean blocking size computation
+#6898:6fb31ebe6492   # rotating kernel for ARM
+6899:877facace746   # rotating kernel for ARM only
+#6904:c250623ae9fa   # result_of
+6921:915f1b1fc158   # fix prefetching change for ARM
+6923:9ff25f6dacc6   # prefetching
+6933:52572e60b5d3   # blocking size strategy
+6937:c8c042f286b2   # avoid redundant pack_rhs
+6981:7e5d6f78da59   # dynamic loop swapping
+6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
+6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
+7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+
diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp
new file mode 100644
index 000000000..72eb9cab6
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm.cpp
@@ -0,0 +1,67 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
+// Benchmarked kernel: accumulates the product A*B into C. EIGEN_DONT_INLINE keeps it a separate function.
+EIGEN_DONT_INLINE
+void gemm(const Mat &A, const Mat &B, Mat &C)
+{
+  C.noalias() += A * B;
+}
+// Times C += A*B for an (m x k) by (k x n) product and returns 1e-9 * flops / best time, i.e. a GFLOP/s figure.
+EIGEN_DONT_INLINE
+double bench(long m, long n, long k)
+{
+  Mat lhs(m,k);
+  Mat rhs(k,n);
+  Mat acc(m,n);
+  lhs.setRandom();
+  rhs.setRandom();
+  acc.setZero();
+
+  BenchTimer timer;
+
+  // Work budget and bounds on the number of tries; all of them are lowered for complex scalars.
+  const bool is_complex = NumTraits<Scalar>::IsComplex;
+  const double budget = is_complex ? (1e8*4/sizeof(Scalar))/4 : 1e8*4/sizeof(Scalar);
+  const double min_tries = is_complex ? 2. : 4.;
+  const double max_tries = is_complex ? 4. : 10.;
+
+  // One multiply and one add per (i,j,l) triple.
+  const double flops = 2. * m * n * k;
+  const double ratio = budget/flops;
+  long rep = std::max(1., std::min(100., ratio) );
+  long tries = std::max(min_tries, std::min(max_tries, ratio) );
+
+  BENCH(timer, tries, rep, gemm(lhs,rhs,acc));
+
+  // Each timed try runs the product `rep` times.
+  return 1e-9 * rep * flops / timer.best();
+}
+// Reads "m n k" triples from settings.txt and prints one bench() result per triple as a single row on stdout.
+int main(int /*argc*/, char ** /*argv*/)
+{
+  std::ifstream settings("settings.txt");
+  if(!settings)
+  {
+    // Without this check a missing file was indistinguishable from an empty workload.
+    std::cerr << "gemm: cannot open settings.txt" << std::endl;
+    return 1;
+  }
+  std::vector<double> results;
+  long m, n, k;
+  while(settings >> m >> n >> k)
+    results.push_back( bench(m, n, k) );
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+  return 0;
+}
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
new file mode 100755
index 000000000..609c471f9
--- /dev/null
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Usage: make_plot.sh <name>
+# Reads <name>.out (one "rev results..." row per line) and settings.txt,
+# and generates <name>.pdf with gnuplot.
+WHAT=$1
+
+header="rev "
+while read -r line || [ -n "$line" ]
+do
+  if [ -n "$line" ]; then
+    header="$header  \"$line\""
+  fi
+done < settings.txt
+
+echo $header > "$WHAT.out.header"
+cat "$WHAT.out" >> "$WHAT.out.header"
+
+# Build the gnuplot script.
+echo "set title '$WHAT'" > "$WHAT.gnuplot"
+echo "set key autotitle columnhead outside " >> "$WHAT.gnuplot"
+echo "set xtics rotate 1" >> "$WHAT.gnuplot"
+
+echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> "$WHAT.gnuplot"
+echo "set output '$WHAT.pdf'" >> "$WHAT.gnuplot"
+# One data column per non-blank line of settings.txt, matching the header built above.
+col=$(grep -c '[^[:space:]]' settings.txt)
+echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> "$WHAT.gnuplot"
+echo " " >>  "$WHAT.gnuplot"
+
+gnuplot -persist < "$WHAT.gnuplot"
+
+# generate a png file
+# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten  .$WHAT.png
+
+# clean
+rm "$WHAT.out.header" "$WHAT.gnuplot"
+\ No newline at end of file
diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh
new file mode 100755
index 000000000..3fa6a3661
--- /dev/null
+++ b/bench/perf_monitoring/gemm/run_gemm.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+# Examples of environment variables to be set:
+#   PREFIX="haswell-fma-"
+#   CXX_FLAGS="-mfma"
+
+# Options:
+#   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+#   -s  : recompute selected changesets only and keep bests
+
+
+global_args="$*"
+# -up anywhere on the command line requests recomputation of all changesets.
+update=false
+if echo "$global_args" | grep -q '\-up'; then
+  update=true
+fi
+
+# -s anywhere on the command line requests recomputation of the changesets named in the arguments.
+selected=false
+if echo "$global_args" | grep -q '\-s'; then
+  selected=true
+fi
+
+# Report the chosen mode; -s takes precedence over -up in the message.
+if [ "$selected" = true ]; then
+ echo "Recompute selected changesets only and keep bests"
+elif [ "$update" = true ]; then
+ echo "(Re-)Compute all changesets and keep bests"
+else
+ echo "Skip previously computed changesets"
+fi
+
+
+
+if [ -d "eigen_src" ]; then
+  # Refresh the existing checkout; the subshell leaves the working directory untouched.
+  ( cd eigen_src && hg pull -u )
+else
+  # First run: fetch the Eigen sources.
+  hg clone https://bitbucket.org/eigen/eigen eigen_src
+fi
+
+if [ -z "$CXX" ]; then
+  CXX=g++  # default compiler, used only when the caller did not set CXX
+fi
+# make_backup <name>: renames <name>.out to <name>.backup when the former exists.
+function make_backup
+{
+  if test -f "$1.out"; then
+    mv "$1.out" "$1.backup"
+  fi
+}
+# merge <list1> <list2>: prints the element-wise maximum of two whitespace-separated number lists.
+function merge
+{
+  a=( $1 ); b=( $2 )
+  count1=${#a[@]}
+  count2=${#b[@]}
+
+  # Lists of different lengths cannot be merged: print the first one as is.
+  if [ $count1 != $count2 ]; then
+    echo $1
+    return
+  fi
+  res=""
+  for (( i=0 ; i<count1 ; i++ )); do
+    ai=${a[$i]}; bi=${b[$i]}
+    tmp=$(echo "if ($ai > $bi) $ai else $bi " | bc -l)
+    res="$res $tmp"
+  done
+  echo $res
+}
+# test_current <rev> <scalar> <name>: benchmarks gemm.cpp built with -DSCALAR=<scalar> and appends "<rev> <results>" to <name>.out.
+function test_current 
+{
+  rev=$1
+  scalar=$2
+  name=$3
+  # Results stored for this revision by a previous run, taken from <name>.backup when that file exists.
+  prev=""
+  if [ -e "$name.backup" ]; then
+    prev=`grep $rev "$name.backup" | cut -c 14-`  # drop the first 13 characters of the matching line
+  fi
+  res=$prev
+  count_rev=`echo $prev |  wc -w`  # number of stored figures
+  count_ref=`cat "settings.txt" |  wc -l`  # number of lines in settings.txt
+  if echo "$global_args" | grep "$rev" > /dev/null; then
+    rev_found=true
+  else
+    rev_found=false
+  fi
+#  Recompute when forced (-up), when the number of stored figures differs from the
+#  number of settings lines, or when -s was given and this revision is among the arguments.
+  if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] &&  [ $rev_found == true ]); then
+    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name`
+      if [ $count_rev == $count_ref ]; then
+        echo "merge previous $prev"
+        echo "with new       $curr"
+      else
+        echo "got            $curr"
+      fi
+      res=`merge "$curr" "$prev"`
+#     merge prints the larger of each pair, or $curr alone when the two lists differ in length
+      echo "$rev $res" >> $name.out
+    else
+      echo "Compilation failed, skip rev $rev"
+    fi
+  else
+    echo "Skip existing results for $rev / $name"
+    echo "$rev $res" >> $name.out
+  fi
+}
+# Move the previous .out files to .backup so that this run writes fresh ones.
+make_backup $PREFIX"sgemm"
+make_backup $PREFIX"dgemm"
+make_backup $PREFIX"cgemm"
+# Strip "#" comments from changesets.txt and benchmark every remaining revision or tag.
+cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
+do
+  if [ -n "$rev" ]; then
+    echo "Testing rev $rev"
+    cd eigen_src
+    hg up -C "$rev" > /dev/null
+    actual_rev=`hg identify | cut -f1 -d' '`
+    cd ..
+
+    test_current $actual_rev float                  $PREFIX"sgemm"
+    test_current $actual_rev double                 $PREFIX"dgemm"
+    test_current $actual_rev "std::complex<double>" $PREFIX"cgemm"
+  fi
+
+done
+
+echo "Float:"
+cat $PREFIX"sgemm.out"
+echo ""
+
+echo "Double:"
+cat $PREFIX"dgemm.out"
+echo ""
+
+echo "Complex:"
+cat $PREFIX"cgemm.out"
+echo ""
+
+./make_plot.sh $PREFIX"sgemm"
+./make_plot.sh $PREFIX"dgemm"
+./make_plot.sh $PREFIX"cgemm"
+
+
diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt
new file mode 100644
index 000000000..5c43e1c7d
--- /dev/null
+++ b/bench/perf_monitoring/gemm/settings.txt
@@ -0,0 +1,15 @@
+8 8 8
+9 9 9
+24 24 24
+239 239 239
+240 240 240
+2400 24 24
+24 2400 24
+24 24 2400
+24 2400 2400
+2400 24 2400
+2400 2400 24
+2400 2400 64
+4800 23 160
+23 4800 160
+2400 2400 2400
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
new file mode 100644
index 000000000..525b9acda
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks.h
@@ -0,0 +1,305 @@
+#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+
+typedef int TensorIndex;
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "testing/base/public/benchmark.h"
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// Benchmarks of Eigen Tensor operations on a Device, run over three float buffers: a_ (m x k), b_ (k x n) and c_ (m x n).
+// TODO(bsteiner): also templatize on the input type since we have users
+// for int8 as well as floats.
+template <typename Device> class BenchmarkSuite {
+ public:
+  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
+      : m_(m), k_(k), n_(n), device_(device) {
+    initialize();
+  }
+  // Convenience constructor for cubic problems (k = n = m).
+  BenchmarkSuite(const Device& device, size_t m)
+      : m_(m), k_(m), n_(m), device_(device) {
+    initialize();
+  }
+  // Releases the three buffers obtained in initialize().
+  ~BenchmarkSuite() {
+    device_.deallocate(a_);
+    device_.deallocate(b_);
+    device_.deallocate(c_);
+  }
+
+  void memcpy(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  void random(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = C.random();
+    }
+    // Record the number of random numbers generated per second
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  void slicing(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+    const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
+    const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
+    const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
+    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+    // Record the number of values copied from the rhs slice to the lhs slice
+    // each second
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  void shuffling(int num_iters) {
+    eigen_assert(m_ == n_);
+    const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+    const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+    const Eigen::array<int, 2> shuffle(1, 0);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+    // Record the number of values shuffled from A and copied to B each second
+    finalizeBenchmark(m_ * k_ * num_iters);
+  }
+
+  void padding(int num_iters) {
+    eigen_assert(m_ == k_);
+    const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+    const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
+    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
+    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+    // Record the number of values copied from the padded tensor A each second
+    finalizeBenchmark(m_ * k_ * num_iters);
+  }
+
+  void striding(int num_iters) {
+    eigen_assert(m_ == k_);
+    const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+    const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+    const Eigen::array<TensorIndex, 2> strides(1, 2);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+    // Counts the m_ x k_ values of A spanned by the strided copy (B itself is m_ x k_/2)
+    finalizeBenchmark(m_ * k_ * num_iters);
+  }
+
+  void broadcasting(int num_iters) {
+    const Eigen::array<TensorIndex, 2> size_a(m_, 1);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+    const Eigen::array<TensorIndex, 2> size_c(m_, n_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
+
+#if defined(__CUDACC__)
+    // nvcc doesn't support cxx11
+    const Eigen::array<int, 2> broadcast(1, n_);
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
+    broadcast.set(1, n_);
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+    // Record the number of values broadcasted from A and copied to C each second
+    finalizeBenchmark(m_ * n_ * num_iters);
+  }
+
+  void coeffWiseOp(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
+    }
+    // Record the number of FLOP executed per second (2 multiplications and
+    // 1 addition per value)
+    finalizeBenchmark(3 * m_ * m_ * num_iters);
+  }
+
+  void algebraicFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  void transcendentalFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  // Simple reduction
+  void reduction(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size(k_, n_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
+    const Eigen::array<TensorIndex, 1> output_size(n_);
+    TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+
+    const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value of the k_ x n_ input)
+    finalizeBenchmark(k_ * n_ * num_iters);
+  }
+
+  // do a contraction which is equivalent to a matrix multiplication
+  void contraction(int num_iters) {
+    const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
+    const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
+    const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<float, 2>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (k_ multiplications and
+    // additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
+  }
+
+  void convolution(int num_iters, int kernel_x, int kernel_y) {
+    const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
+    const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    const Eigen::array<TensorIndex, 2> result_sizes(
+        m_ - kernel_x + 1, n_ - kernel_y + 1);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+    }
+    // Record the number of FLOP executed per second (kernel_size
+    // multiplications and additions for each value in the resulting tensor)
+    finalizeBenchmark(
+        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
+  }
+
+ private:
+  void initialize() {
+    a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
+    b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
+    c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+
+    // Initialize the content of the memory pools to prevent asan from
+    // complaining.
+    device_.memset(a_, 12, m_ * k_ * sizeof(float));
+    device_.memset(b_, 23, k_ * n_ * sizeof(float));
+    device_.memset(c_, 31, m_ * n_ * sizeof(float));
+
+    BenchmarkUseRealTime();
+  }
+
+  inline void finalizeBenchmark(int64 num_items) {
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      device_.synchronize();
+    }
+#endif
+    StopBenchmarkTiming();
+    SetBenchmarkItemsProcessed(num_items);
+  }
+  // Not copyable (declared, never defined): a copy would deallocate a_, b_ and c_ a second time.
+  BenchmarkSuite(const BenchmarkSuite&); void operator=(const BenchmarkSuite&);
+  size_t m_;
+  size_t k_;
+  size_t n_;
+  float* a_;
+  float* b_;
+  float* c_;
+  Device device_;
+};
+#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
new file mode 100644
index 000000000..68653ba15
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -0,0 +1,156 @@
+#define EIGEN_USE_THREADS
+
+#include "base/sysinfo.h"
+#include "strings/strcat.h"
+#include "third_party/eigen3/tensor_benchmarks.h"
+#include "thread/threadpool.h"
+// CREATE_THREAD_POOL(threads) declares a local Eigen::ThreadPoolDevice named `device`; outside Android it first creates and starts a ThreadPool `tp`.
+#ifdef __ANDROID__
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPoolDevice device(threads);
+#else
+#define CREATE_THREAD_POOL(threads)             \
+ThreadPool tp(threads);                         \
+tp.StartWorkers();                              \
+Eigen::ThreadPoolDevice device(&tp, threads);
+#endif
+// Simple functions: BM_FuncCPU(FUNC, THREADS) defines BM_<FUNC>_<THREADS>T(iters, N), which runs suite.FUNC(iters) on a
+// BenchmarkSuite with m = k = n = N backed by a THREADS-thread pool, then passes it to BENCHMARK_RANGE with bounds 10 and 5000.
+#define BM_FuncCPU(FUNC, THREADS)                                \
+  static void BM_##FUNC##_##THREADS##T(int iters, int N) {       \
+    StopBenchmarkTiming();                                       \
+    CREATE_THREAD_POOL(THREADS);                                 \
+    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);    \
+    suite.FUNC(iters);                                           \
+    SetBenchmarkLabel(StrCat("using ", THREADS, " threads"));    \
+  }                                                              \
+  BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
+
+BM_FuncCPU(memcpy, 4);
+BM_FuncCPU(memcpy, 8);
+BM_FuncCPU(memcpy, 12);
+
+BM_FuncCPU(random, 4);
+BM_FuncCPU(random, 8);
+BM_FuncCPU(random, 12);
+
+BM_FuncCPU(slicing, 4);
+BM_FuncCPU(slicing, 8);
+BM_FuncCPU(slicing, 12);
+
+BM_FuncCPU(shuffling, 4);
+BM_FuncCPU(shuffling, 8);
+BM_FuncCPU(shuffling, 12);
+
+BM_FuncCPU(padding, 4);
+BM_FuncCPU(padding, 8);
+BM_FuncCPU(padding, 12);
+
+BM_FuncCPU(striding, 4);
+BM_FuncCPU(striding, 8);
+BM_FuncCPU(striding, 12);
+
+BM_FuncCPU(broadcasting, 4);
+BM_FuncCPU(broadcasting, 8);
+BM_FuncCPU(broadcasting, 12);
+
+BM_FuncCPU(coeffWiseOp, 4);
+BM_FuncCPU(coeffWiseOp, 8);
+BM_FuncCPU(coeffWiseOp, 12);
+
+BM_FuncCPU(algebraicFunc, 4);
+BM_FuncCPU(algebraicFunc, 8);
+BM_FuncCPU(algebraicFunc, 12);
+
+BM_FuncCPU(transcendentalFunc, 4);
+BM_FuncCPU(transcendentalFunc, 8);
+BM_FuncCPU(transcendentalFunc, 12);
+
+BM_FuncCPU(reduction, 4);
+BM_FuncCPU(reduction, 8);
+BM_FuncCPU(reduction, 12);
+
+// Contractions: D1, D2 and D3 become the suite's m, k and n; each may be a literal or the function parameter N.
+// THREADS == 1 selects Eigen::DefaultDevice, any other value a thread pool of that size.
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS)                     \
+  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
+    StopBenchmarkTiming();                                                     \
+    if (THREADS == 1) {                                                        \
+      Eigen::DefaultDevice device;                                             \
+      BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3);          \
+      suite.FUNC(iters);                                                       \
+    } else {                                                                   \
+      CREATE_THREAD_POOL(THREADS);                                             \
+      BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3);       \
+      suite.FUNC(iters);                                                       \
+    }                                                                          \
+    SetBenchmarkLabel(StrCat("using ", THREADS, " threads"));                  \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
+
+
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
+
+// Convolutions: DIM1 and DIM2 are forwarded to suite.FUNC as the kernel dimensions; the suite itself uses m = k = n = N.
+// The range passed to BENCHMARK_RANGE starts at 128 here rather than 10.
+#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS)                    \
+  static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) {   \
+    StopBenchmarkTiming();                                                     \
+    CREATE_THREAD_POOL(THREADS);                                               \
+    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);                  \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+    SetBenchmarkLabel(StrCat("using ", THREADS, " threads"));                  \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc
new file mode 100644
index 000000000..adea754ad
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_gpu.cc
@@ -0,0 +1,75 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "strings/strcat.h"
+#include "third_party/eigen3/tensor_benchmarks.h"
+
+// Simple functions: BM_<FUNC> builds a GpuDevice on a fresh CUDA stream, then
+// calls suite.FUNC(iters). CUDA return codes are not checked here.
+// NOTE(review): the stream is destroyed while device/suite are still in scope.
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    cudaStream_t stream;                                                       \
+    cudaStreamCreate(&stream);                                                 \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+    cudaStreamDestroy(stream);                                                 \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+// One benchmark per suite method named below.
+BM_FuncGPU(memcpy);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(reduction);
+
+// Contractions: BM_<FUNC>_<D1>x<D2>x<D3> passes D1, D2, D3 to the suite ctor.
+// NOTE(review): stream is destroyed before device/suite leave scope; confirm.
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    cudaStream_t stream;                                                       \
+    cudaStreamCreate(&stream);                                                 \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3);                \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters);                                                         \
+    cudaStreamDestroy(stream);                                                 \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+// N expands to the benchmark's own `int N` parameter; 64 is a fixed value.
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+
+// Convolutions: BM_<FUNC>_<DIM1>x<DIM2> calls suite.FUNC(iters, DIM1, DIM2).
+// NOTE(review): stream is destroyed before device/suite leave scope; confirm.
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    cudaStream_t stream;                                                       \
+    cudaStreamCreate(&stream);                                                 \
+    Eigen::GpuDevice device(&stream);                                          \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
+    cudaDeviceSynchronize();                                                   \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+    cudaStreamDestroy(stream);                                                 \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+// Arguments are (FUNC, DIM1, DIM2); each pair appears in both orders.
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);