From 9930e9583b336edda281a4490cfe69c53082318e Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Mon, 2 Mar 2015 18:08:38 -0500
Subject: Improve analyze-blocking-sizes, and in particular give it a
 evaluate-defaults tool that shows the efficiency of Eigen's default blocking
 sizes choices, using a previously computed table from
 benchmark-blocking-sizes.

---
 bench/analyze-blocking-sizes.cpp | 232 +++++++++++++++++++++++++++++++--------
 1 file changed, 186 insertions(+), 46 deletions(-)

(limited to 'bench/analyze-blocking-sizes.cpp')
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
index a603c0216..f507eab38 100644
--- a/bench/analyze-blocking-sizes.cpp
+++ b/bench/analyze-blocking-sizes.cpp
@@ -7,6 +7,10 @@
 #include <string>
 #include <cmath>
 #include <cassert>
+#include <cstring>
+#include <memory>
+
+#include <Eigen/Core>
 
 using namespace std;
 
@@ -88,6 +92,11 @@ struct preprocessed_inputfile_entry_t
   float efficiency;
 };
 
+bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
+{
+  return e1.efficiency < e2.efficiency;
+}
+
 struct preprocessed_inputfile_t
 {
   string filename;
@@ -394,63 +403,194 @@ void print_partition(
   cout << endl;
 }
 
-int main(int argc, char* argv[])
+struct action_t
 {
-  if (argc == 1) {
-    cerr << "usage: " << argv[0] << " [input files]" << endl;
-    cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
-    exit(1);
-  }
-  cout.precision(3);
-  cerr.precision(3);
-  vector<string> inputfilenames;
-  for (int i = 1; i < argc; i++) {
-  	inputfilenames.emplace_back(argv[i]);
-  }
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run(const vector<preprocessed_inputfile_t>& preprocessed_inputfiles) const { abort(); }
+  virtual ~action_t() {}
+};
 
-  vector<preprocessed_inputfile_t> preprocessed_inputfiles;
-  for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) {
-    preprocessed_inputfiles.emplace_back(inputfile_t(*it));
+struct partition_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "partition"; }
+  virtual void run(const vector<preprocessed_inputfile_t>& preprocessed_inputfiles) const
+  {
+    check_all_files_in_same_exact_order(preprocessed_inputfiles);
+
+    float required_efficiency_to_beat = 0.0f;
+    vector<vector<vector<size_t>>> partitions;
+    cerr << "searching for partitions...\r" << flush;
+    while (true)
+    {
+      vector<vector<size_t>> partition;
+      find_partition_with_efficiency_higher_than(
+        preprocessed_inputfiles,
+        required_efficiency_to_beat,
+        partition);
+      float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+      cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
+           << " subsets for " << 100.0f * actual_efficiency
+           << " % efficiency"
+           << "                  \r" << flush;
+      partitions.push_back(partition);
+      if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
+        break;
+      }
+      required_efficiency_to_beat = actual_efficiency;
+    }
+    cerr << "                                                                  " << endl;
+    while (true) {
+      bool repeat = false;
+      for (size_t i = 0; i < partitions.size() - 1; i++) {
+        if (partitions[i].size() >= partitions[i+1].size()) {
+          partitions.erase(partitions.begin() + i);
+          repeat = true;
+          break;
+        }
+      }
+      if (!repeat) {
+        break;
+      }
+    }
+    for (auto it = partitions.begin(); it != partitions.end(); ++it) {
+      print_partition(preprocessed_inputfiles, *it);
+    }
   }
+};
 
-  check_all_files_in_same_exact_order(preprocessed_inputfiles);
+uint8_t log2_pot(size_t x) {
+  size_t l = 0;
+  while (x >>= 1) l++;
+  return l;
+}
 
-  float required_efficiency_to_beat = 0.0f;
-  vector<vector<vector<size_t>>> partitions;
-  cerr << "searching for partitions...\r" << flush;
-  while (true)
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+  size_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  size_triple_t(uint16_t compact)
   {
-    vector<vector<size_t>> partition;
-    find_partition_with_efficiency_higher_than(
-      preprocessed_inputfiles,
-      required_efficiency_to_beat,
-      partition);
-    float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
-    cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
-         << " subsets for " << 100.0f * actual_efficiency
-         << " % efficiency"
-         << "                  \r" << flush;
-    partitions.push_back(partition);
-    if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
-      break;
-    }
-    required_efficiency_to_beat = actual_efficiency;
+    k = 1 << ((compact & 0xf00) >> 8);
+    m = 1 << ((compact & 0x0f0) >> 4);
+    n = 1 << ((compact & 0x00f) >> 0);
   }
-  cerr << "                                                                  " << endl;
-  while (true) {
-    bool repeat = false;
-    for (size_t i = 0; i < partitions.size() - 1; i++) {
-      if (partitions[i].size() >= partitions[i+1].size()) {
-        partitions.erase(partitions.begin() + i);
-        repeat = true;
-        break;
+  bool is_cubic() const { return k == m && m == n; }
+};
+
+struct evaluate_defaults_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "evaluate-defaults"; }
+  virtual void run(const vector<preprocessed_inputfile_t>& preprocessed_inputfiles) const
+  {
+    if (preprocessed_inputfiles.size() > 1) {
+      cerr << invokation_name() << " only works with one input file." << endl;
+      exit(1);
+    }
+
+    const preprocessed_inputfile_t& preprocessed_inputfile = preprocessed_inputfiles.front();
+
+    uint16_t product_size = 0;
+    uint16_t default_block_size = 0;
+    vector<preprocessed_inputfile_entry_t> results, cubic_results;
+    for (auto it = preprocessed_inputfile.entries.begin(); it != preprocessed_inputfile.entries.end(); ++it) {
+      if (it->product_size != product_size) {
+        product_size = it->product_size;
+        size_triple_t product_size_triple(product_size);
+        Eigen::Index k = product_size_triple.k,
+                     m = product_size_triple.m,
+                     n = product_size_triple.n;
+        Eigen::internal::computeProductBlockingSizes<float, float>(k, m, n);
+        default_block_size = compact_size_triple(k, m, n);
+      }
+      if (it->block_size == default_block_size) {
+        results.push_back(*it);
+        if (size_triple_t(product_size).is_cubic()) {
+          cubic_results.push_back(*it);
+        }
       }
     }
-    if (!repeat) {
+
+    cerr << "Below are all results - first column is product size tripe, " << endl
+         << "second column is block size triple, in hex kmn form where" << endl
+         << "k, m, n are the log2 of the actual values, i.e. a89 means" << endl
+         << "k=1024, m=256, n=512." << endl << endl;
+    for (auto it = results.begin(); it != results.end(); ++it) {
+      cerr << hex << it->product_size << " " << it->block_size
+           << " efficiency " << std::dec << 100.0f * it->efficiency << " %" << endl;
+    }
+    cerr << endl;
+    sort(results.begin(), results.end(), lower_efficiency);
+    sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
+    cerr << "Efficiency summary: min = "
+         << 100.0f * results.front().efficiency << " %, max = "
+         << 100.0f * results.back().efficiency << " %, median = "
+         << 100.0f * results[results.size() / 2].efficiency << " %" << endl;
+    cerr << "20% of product sizes have efficiency <= " << 100.0f * results[results.size() * 20 / 100].efficiency << " %" << endl;
+    cerr << "10% of product sizes have efficiency <= " << 100.0f * results[results.size() * 10 / 100].efficiency << " %" << endl;
+    cerr << "5% of product sizes have efficiency <= " << 100.0f * results[results.size() * 5 / 100].efficiency << " %" << endl;
+    cerr << "Cubic sizes efficiency summary: min = "
+         << 100.0f * cubic_results.front().efficiency << " %, max = "
+         << 100.0f * cubic_results.back().efficiency << " %, median = "
+         << 100.0f * cubic_results[cubic_results.size() / 2].efficiency << " %" << endl;
+    
+  }
+};
+
+
+void show_usage_and_exit(int argc, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> <input files...>" << endl;
+  cerr << "available actions:" << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  } 
+  cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
+  exit(1);
+}
+
+int main(int argc, char* argv[])
+{
+  cout.precision(3);
+  cerr.precision(3);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new partition_action_t);
+  available_actions.emplace_back(new evaluate_defaults_action_t);
+
+  auto action = available_actions.end();
+
+  if (argc <= 2) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    if (!strcmp(argv[1], (*it)->invokation_name())) {
+      action = it;
       break;
     }
   }
-  for (auto it = partitions.begin(); it != partitions.end(); ++it) {
-    print_partition(preprocessed_inputfiles, *it);
+
+  if (action == available_actions.end()) {
+    show_usage_and_exit(argc, argv, available_actions);
   }
+
+  vector<string> inputfilenames;
+  for (int i = 2; i < argc; i++) {
+  	inputfilenames.emplace_back(argv[i]);
+  }
+
+  vector<preprocessed_inputfile_t> preprocessed_inputfiles;
+  for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) {
+    preprocessed_inputfiles.emplace_back(inputfile_t(*it));
+  }
+
+  (*action)->run(preprocessed_inputfiles);
 }
-- 
cgit v1.2.3