aboutsummaryrefslogtreecommitdiffhomepage
path: root/bench
diff options
context:
space:
mode:
Diffstat (limited to 'bench')
-rw-r--r--bench/analyze-blocking-sizes.cpp876
-rw-r--r--bench/bench_gemm.cpp16
-rw-r--r--bench/bench_norm.cpp33
-rw-r--r--bench/benchmark-blocking-sizes.cpp677
-rw-r--r--bench/btl/CMakeLists.txt1
-rw-r--r--bench/btl/cmake/FindACML.cmake2
-rw-r--r--bench/btl/cmake/FindATLAS.cmake13
-rw-r--r--bench/btl/cmake/FindCBLAS.cmake1
-rw-r--r--bench/btl/cmake/FindOPENBLAS.cmake2
-rw-r--r--bench/btl/libs/eigen2/eigen2_interface.hh2
-rw-r--r--bench/btl/libs/tensors/CMakeLists.txt44
-rw-r--r--bench/btl/libs/tensors/main_linear.cpp23
-rw-r--r--bench/btl/libs/tensors/main_matmat.cpp21
-rw-r--r--bench/btl/libs/tensors/main_vecmat.cpp21
-rw-r--r--bench/btl/libs/tensors/tensor_interface.hh105
-rw-r--r--bench/perf_monitoring/gemm/changesets.txt45
-rw-r--r--bench/perf_monitoring/gemm/gemm.cpp67
-rwxr-xr-xbench/perf_monitoring/gemm/make_plot.sh37
-rwxr-xr-xbench/perf_monitoring/gemm/run_gemm.sh152
-rw-r--r--bench/perf_monitoring/gemm/settings.txt15
-rw-r--r--bench/tensors/tensor_benchmarks.h305
-rw-r--r--bench/tensors/tensor_benchmarks_cpu.cc156
-rw-r--r--bench/tensors/tensor_benchmarks_gpu.cc75
23 files changed, 2662 insertions, 27 deletions
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
new file mode 100644
index 000000000..d563a1d2d
--- /dev/null
+++ b/bench/analyze-blocking-sizes.cpp
@@ -0,0 +1,876 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <memory>
+
+#include <Eigen/Core>
+
+using namespace std;
+
+const int default_precision = 4;
+
+// see --only-cubic-sizes
+bool only_cubic_sizes = false;
+
+// see --dump-tables
+bool dump_tables = false;
+
+uint8_t log2_pot(size_t x) {
+ size_t l = 0;
+ while (x >>= 1) l++;
+ return l;
+}
+
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+ return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+ uint16_t k, m, n;
+ size_triple_t() : k(0), m(0), n(0) {}
+ size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+ size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+ size_triple_t(uint16_t compact)
+ {
+ k = 1 << ((compact & 0xf00) >> 8);
+ m = 1 << ((compact & 0x0f0) >> 4);
+ n = 1 << ((compact & 0x00f) >> 0);
+ }
+ bool is_cubic() const { return k == m && m == n; }
+};
+
+ostream& operator<<(ostream& s, const size_triple_t& t)
+{
+ return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
+}
+
+struct inputfile_entry_t
+{
+ uint16_t product_size;
+ uint16_t pot_block_size;
+ size_triple_t nonpot_block_size;
+ float gflops;
+};
+
+struct inputfile_t
+{
+ enum class type_t {
+ unknown,
+ all_pot_sizes,
+ default_sizes
+ };
+
+ string filename;
+ vector<inputfile_entry_t> entries;
+ type_t type;
+
+ inputfile_t(const string& fname)
+ : filename(fname)
+ , type(type_t::unknown)
+ {
+ ifstream stream(filename);
+ if (!stream.is_open()) {
+ cerr << "couldn't open input file: " << filename << endl;
+ exit(1);
+ }
+ string line;
+ while (getline(stream, line)) {
+ if (line.empty()) continue;
+ if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
+ if (type != type_t::unknown) {
+ cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+ exit(1);
+ }
+ type = type_t::all_pot_sizes;
+ continue;
+ }
+ if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
+ if (type != type_t::unknown) {
+ cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+ exit(1);
+ }
+ type = type_t::default_sizes;
+ continue;
+ }
+
+
+ if (type == type_t::unknown) {
+ continue;
+ }
+ switch(type) {
+ case type_t::all_pot_sizes: {
+ unsigned int product_size, block_size;
+ float gflops;
+ int sscanf_result =
+ sscanf(line.c_str(), "%x %x %f",
+ &product_size,
+ &block_size,
+ &gflops);
+ if (3 != sscanf_result ||
+ !product_size ||
+ product_size > 0xfff ||
+ !block_size ||
+ block_size > 0xfff ||
+ !isfinite(gflops))
+ {
+ cerr << "ill-formed input file: " << filename << endl;
+ cerr << "offending line:" << endl << line << endl;
+ exit(1);
+ }
+ if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+ continue;
+ }
+ inputfile_entry_t entry;
+ entry.product_size = uint16_t(product_size);
+ entry.pot_block_size = uint16_t(block_size);
+ entry.gflops = gflops;
+ entries.push_back(entry);
+ break;
+ }
+ case type_t::default_sizes: {
+ unsigned int product_size;
+ float gflops;
+ int bk, bm, bn;
+ int sscanf_result =
+ sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
+ &product_size,
+ &bk, &bm, &bn,
+ &gflops);
+ if (5 != sscanf_result ||
+ !product_size ||
+ product_size > 0xfff ||
+ !isfinite(gflops))
+ {
+ cerr << "ill-formed input file: " << filename << endl;
+ cerr << "offending line:" << endl << line << endl;
+ exit(1);
+ }
+ if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+ continue;
+ }
+ inputfile_entry_t entry;
+ entry.product_size = uint16_t(product_size);
+ entry.pot_block_size = 0;
+ entry.nonpot_block_size = size_triple_t(bk, bm, bn);
+ entry.gflops = gflops;
+ entries.push_back(entry);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ stream.close();
+ if (type == type_t::unknown) {
+ cerr << "Unrecognized input file " << filename << endl;
+ exit(1);
+ }
+ if (entries.empty()) {
+ cerr << "didn't find any measurements in input file: " << filename << endl;
+ exit(1);
+ }
+ }
+};
+
+struct preprocessed_inputfile_entry_t
+{
+ uint16_t product_size;
+ uint16_t block_size;
+
+ float efficiency;
+};
+
+bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
+{
+ return e1.efficiency < e2.efficiency;
+}
+
+struct preprocessed_inputfile_t
+{
+ string filename;
+ vector<preprocessed_inputfile_entry_t> entries;
+
+ preprocessed_inputfile_t(const inputfile_t& inputfile)
+ : filename(inputfile.filename)
+ {
+ if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
+ abort();
+ }
+ auto it = inputfile.entries.begin();
+ auto it_first_with_given_product_size = it;
+ while (it != inputfile.entries.end()) {
+ ++it;
+ if (it == inputfile.entries.end() ||
+ it->product_size != it_first_with_given_product_size->product_size)
+ {
+ import_input_file_range_one_product_size(it_first_with_given_product_size, it);
+ it_first_with_given_product_size = it;
+ }
+ }
+ }
+
+private:
+ void import_input_file_range_one_product_size(
+ const vector<inputfile_entry_t>::const_iterator& begin,
+ const vector<inputfile_entry_t>::const_iterator& end)
+ {
+ uint16_t product_size = begin->product_size;
+ float max_gflops = 0.0f;
+ for (auto it = begin; it != end; ++it) {
+ if (it->product_size != product_size) {
+ cerr << "Unexpected ordering of entries in " << filename << endl;
+ cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
+ exit(1);
+ }
+ max_gflops = max(max_gflops, it->gflops);
+ }
+ for (auto it = begin; it != end; ++it) {
+ preprocessed_inputfile_entry_t entry;
+ entry.product_size = it->product_size;
+ entry.block_size = it->pot_block_size;
+ entry.efficiency = it->gflops / max_gflops;
+ entries.push_back(entry);
+ }
+ }
+};
+
+void check_all_files_in_same_exact_order(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
+{
+ if (preprocessed_inputfiles.empty()) {
+ return;
+ }
+
+ const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
+ const size_t num_entries = first_file.entries.size();
+
+ for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
+ if (preprocessed_inputfiles[i].entries.size() != num_entries) {
+ cerr << "these files have different number of entries: "
+ << preprocessed_inputfiles[i].filename
+ << " and "
+ << first_file.filename
+ << endl;
+ exit(1);
+ }
+ }
+
+ for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
+ const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
+ const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
+ for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
+ const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
+ if (cur_file.entries[entry_index].product_size != entry_product_size ||
+ cur_file.entries[entry_index].block_size != entry_block_size)
+ {
+ cerr << "entries not in same order between these files: "
+ << first_file.filename
+ << " and "
+ << cur_file.filename
+ << endl;
+ exit(1);
+ }
+ }
+ }
+}
+
+float efficiency_of_subset(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ const vector<size_t>& subset)
+{
+ if (subset.size() <= 1) {
+ return 1.0f;
+ }
+ const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+ const size_t num_entries = first_file.entries.size();
+ float efficiency = 1.0f;
+ size_t entry_index = 0;
+ size_t first_entry_index_with_this_product_size = 0;
+ uint16_t product_size = first_file.entries[0].product_size;
+ while (entry_index < num_entries) {
+ ++entry_index;
+ if (entry_index == num_entries ||
+ first_file.entries[entry_index].product_size != product_size)
+ {
+ float efficiency_this_product_size = 0.0f;
+ for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+ float efficiency_this_entry = 1.0f;
+ for (auto i = subset.begin(); i != subset.end(); ++i) {
+ efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+ }
+ efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
+ }
+ efficiency = min(efficiency, efficiency_this_product_size);
+ if (entry_index < num_entries) {
+ first_entry_index_with_this_product_size = entry_index;
+ product_size = first_file.entries[entry_index].product_size;
+ }
+ }
+ }
+
+ return efficiency;
+}
+
+void dump_table_for_subset(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ const vector<size_t>& subset)
+{
+ const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+ const size_t num_entries = first_file.entries.size();
+ size_t entry_index = 0;
+ size_t first_entry_index_with_this_product_size = 0;
+ uint16_t product_size = first_file.entries[0].product_size;
+ size_t i = 0;
+ size_triple_t min_product_size(first_file.entries.front().product_size);
+ size_triple_t max_product_size(first_file.entries.back().product_size);
+ if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
+ abort();
+ }
+ if (only_cubic_sizes) {
+ cerr << "Can't generate tables with --only-cubic-sizes." << endl;
+ abort();
+ }
+ cout << "struct LookupTable {" << endl;
+ cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl;
+ const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
+ const size_t TableSize = NumSizes * NumSizes * NumSizes;
+ cout << " static const size_t NumSizes = " << NumSizes << ";" << endl;
+ cout << " static const unsigned short* Data() {" << endl;
+ cout << " static const unsigned short data[" << TableSize << "] = {";
+ while (entry_index < num_entries) {
+ ++entry_index;
+ if (entry_index == num_entries ||
+ first_file.entries[entry_index].product_size != product_size)
+ {
+ float best_efficiency_this_product_size = 0.0f;
+ uint16_t best_block_size_this_product_size = 0;
+ for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+ float efficiency_this_entry = 1.0f;
+ for (auto i = subset.begin(); i != subset.end(); ++i) {
+ efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+ }
+ if (efficiency_this_entry > best_efficiency_this_product_size) {
+ best_efficiency_this_product_size = efficiency_this_entry;
+ best_block_size_this_product_size = first_file.entries[e].block_size;
+ }
+ }
+ if ((i++) % NumSizes) {
+ cout << " ";
+ } else {
+ cout << endl << " ";
+ }
+ cout << "0x" << hex << best_block_size_this_product_size << dec;
+ if (entry_index < num_entries) {
+ cout << ",";
+ first_entry_index_with_this_product_size = entry_index;
+ product_size = first_file.entries[entry_index].product_size;
+ }
+ }
+ }
+ if (i != TableSize) {
+ cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
+ abort();
+ }
+ cout << endl << " };" << endl;
+ cout << " return data;" << endl;
+ cout << " }" << endl;
+ cout << "};" << endl;
+}
+
+float efficiency_of_partition(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ const vector<vector<size_t>>& partition)
+{
+ float efficiency = 1.0f;
+ for (auto s = partition.begin(); s != partition.end(); ++s) {
+ efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
+ }
+ return efficiency;
+}
+
+void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
+{
+ assert(subset_size >= 1 && subset_size <= set_size);
+ out_subset.resize(subset_size);
+ for (size_t i = 0; i < subset_size; i++) {
+ out_subset[i] = i;
+ }
+}
+
+bool is_last_subset(const vector<size_t>& subset, size_t set_size)
+{
+ return subset[0] == set_size - subset.size();
+}
+
+void next_subset(vector<size_t>& inout_subset, size_t set_size)
+{
+ if (is_last_subset(inout_subset, set_size)) {
+ cerr << "iterating past the last subset" << endl;
+ abort();
+ }
+ size_t i = 1;
+ while (inout_subset[inout_subset.size() - i] == set_size - i) {
+ i++;
+ assert(i <= inout_subset.size());
+ }
+ size_t first_index_to_change = inout_subset.size() - i;
+ inout_subset[first_index_to_change]++;
+ size_t p = inout_subset[first_index_to_change];
+ for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
+ inout_subset[j] = ++p;
+ }
+}
+
+const size_t number_of_subsets_limit = 100;
+const size_t always_search_subsets_of_size_at_least = 2;
+
+bool is_number_of_subsets_feasible(size_t n, size_t p)
+{
+ assert(n>0 && p>0 && p<=n);
+ uint64_t numerator = 1, denominator = 1;
+ for (size_t i = 0; i < p; i++) {
+ numerator *= n - i;
+ denominator *= i + 1;
+ if (numerator > denominator * number_of_subsets_limit) {
+ return false;
+ }
+ }
+ return true;
+}
+
+size_t max_feasible_subset_size(size_t n)
+{
+ assert(n > 0);
+ const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
+ for (size_t p = 1; p <= n - 1; p++) {
+ if (!is_number_of_subsets_feasible(n, p+1)) {
+ return max(p, minresult);
+ }
+ }
+ return n - 1;
+}
+
+void find_subset_with_efficiency_higher_than(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ float required_efficiency_to_beat,
+ vector<size_t>& inout_remainder,
+ vector<size_t>& out_subset)
+{
+ out_subset.resize(0);
+
+ if (required_efficiency_to_beat >= 1.0f) {
+ cerr << "can't beat efficiency 1." << endl;
+ abort();
+ }
+
+ while (!inout_remainder.empty()) {
+
+ vector<size_t> candidate_indices(inout_remainder.size());
+ for (size_t i = 0; i < candidate_indices.size(); i++) {
+ candidate_indices[i] = i;
+ }
+
+ size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
+ while (candidate_indices_subset_size >= 1) {
+ vector<size_t> candidate_indices_subset;
+ make_first_subset(candidate_indices_subset_size,
+ candidate_indices_subset,
+ candidate_indices.size());
+
+ vector<size_t> best_candidate_indices_subset;
+ float best_efficiency = 0.0f;
+ vector<size_t> trial_subset = out_subset;
+ trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
+ while (true)
+ {
+ for (size_t i = 0; i < candidate_indices_subset_size; i++) {
+ trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
+ }
+
+ float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+ if (trial_efficiency > best_efficiency) {
+ best_efficiency = trial_efficiency;
+ best_candidate_indices_subset = candidate_indices_subset;
+ }
+ if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
+ break;
+ }
+ next_subset(candidate_indices_subset, candidate_indices.size());
+ }
+
+ if (best_efficiency > required_efficiency_to_beat) {
+ for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
+ candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
+ }
+ candidate_indices.resize(best_candidate_indices_subset.size());
+ }
+ candidate_indices_subset_size--;
+ }
+
+ size_t candidate_index = candidate_indices[0];
+ auto candidate_iterator = inout_remainder.begin() + candidate_index;
+ vector<size_t> trial_subset = out_subset;
+
+ trial_subset.push_back(*candidate_iterator);
+ float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
+ if (trial_efficiency > required_efficiency_to_beat) {
+ out_subset.push_back(*candidate_iterator);
+ inout_remainder.erase(candidate_iterator);
+ } else {
+ break;
+ }
+ }
+}
+
+void find_partition_with_efficiency_higher_than(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ float required_efficiency_to_beat,
+ vector<vector<size_t>>& out_partition)
+{
+ out_partition.resize(0);
+
+ vector<size_t> remainder;
+ for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
+ remainder.push_back(i);
+ }
+
+ while (!remainder.empty()) {
+ vector<size_t> new_subset;
+ find_subset_with_efficiency_higher_than(
+ preprocessed_inputfiles,
+ required_efficiency_to_beat,
+ remainder,
+ new_subset);
+ out_partition.push_back(new_subset);
+ }
+}
+
+void print_partition(
+ const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+ const vector<vector<size_t>>& partition)
+{
+ float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+ cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency" << endl;
+ for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
+ cout << " Subset " << (subset - partition.begin())
+ << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
+ << endl;
+ for (auto file = subset->begin(); file != subset->end(); ++file) {
+ cout << " " << preprocessed_inputfiles[*file].filename << endl;
+ }
+ if (dump_tables) {
+ cout << " Table:" << endl;
+ dump_table_for_subset(preprocessed_inputfiles, *subset);
+ }
+ }
+ cout << endl;
+}
+
+struct action_t
+{
+ virtual const char* invokation_name() const { abort(); return nullptr; }
+ virtual void run(const vector<string>&) const { abort(); }
+ virtual ~action_t() {}
+};
+
+struct partition_action_t : action_t
+{
+ virtual const char* invokation_name() const override { return "partition"; }
+ virtual void run(const vector<string>& input_filenames) const override
+ {
+ vector<preprocessed_inputfile_t> preprocessed_inputfiles;
+
+ if (input_filenames.empty()) {
+ cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
+ exit(1);
+ }
+
+ for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
+ inputfile_t inputfile(*it);
+ switch (inputfile.type) {
+ case inputfile_t::type_t::all_pot_sizes:
+ preprocessed_inputfiles.emplace_back(inputfile);
+ break;
+ case inputfile_t::type_t::default_sizes:
+ cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
+ << "has no use for " << *it << " which contains measurements for default sizes." << endl;
+ exit(1);
+ break;
+ default:
+ cerr << "Unrecognized input file: " << *it << endl;
+ exit(1);
+ }
+ }
+
+ check_all_files_in_same_exact_order(preprocessed_inputfiles);
+
+ float required_efficiency_to_beat = 0.0f;
+ vector<vector<vector<size_t>>> partitions;
+ cerr << "searching for partitions...\r" << flush;
+ while (true)
+ {
+ vector<vector<size_t>> partition;
+ find_partition_with_efficiency_higher_than(
+ preprocessed_inputfiles,
+ required_efficiency_to_beat,
+ partition);
+ float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+ cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
+ << " subsets for " << 100.0f * actual_efficiency
+ << " % efficiency"
+ << " \r" << flush;
+ partitions.push_back(partition);
+ if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
+ break;
+ }
+ required_efficiency_to_beat = actual_efficiency;
+ }
+ cerr << " " << endl;
+ while (true) {
+ bool repeat = false;
+ for (size_t i = 0; i < partitions.size() - 1; i++) {
+ if (partitions[i].size() >= partitions[i+1].size()) {
+ partitions.erase(partitions.begin() + i);
+ repeat = true;
+ break;
+ }
+ }
+ if (!repeat) {
+ break;
+ }
+ }
+ for (auto it = partitions.begin(); it != partitions.end(); ++it) {
+ print_partition(preprocessed_inputfiles, *it);
+ }
+ }
+};
+
+struct evaluate_defaults_action_t : action_t
+{
+ struct results_entry_t {
+ uint16_t product_size;
+ size_triple_t default_block_size;
+ uint16_t best_pot_block_size;
+ float default_gflops;
+ float best_pot_gflops;
+ float default_efficiency;
+ };
+ friend ostream& operator<<(ostream& s, const results_entry_t& entry)
+ {
+ return s
+ << "Product size " << size_triple_t(entry.product_size)
+ << ": default block size " << entry.default_block_size
+ << " -> " << entry.default_gflops
+ << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
+ << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
+ << " -> " << entry.best_pot_gflops
+ << " GFlop/s" << dec;
+ }
+ static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
+ return e1.default_efficiency < e2.default_efficiency;
+ }
+ virtual const char* invokation_name() const override { return "evaluate-defaults"; }
+ void show_usage_and_exit() const
+ {
+ cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
+ cerr << "checks how well the performance with default sizes compares to the best "
+ << "performance measured over all POT sizes." << endl;
+ exit(1);
+ }
+ virtual void run(const vector<string>& input_filenames) const override
+ {
+ if (input_filenames.size() != 2) {
+ show_usage_and_exit();
+ }
+ inputfile_t inputfile_default_sizes(input_filenames[0]);
+ inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
+ if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
+ cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
+ show_usage_and_exit();
+ }
+ if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
+ cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
+ show_usage_and_exit();
+ }
+ vector<results_entry_t> results;
+ vector<results_entry_t> cubic_results;
+
+ uint16_t product_size = 0;
+ auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
+ for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
+ it_default_sizes != inputfile_default_sizes.entries.end();
+ ++it_default_sizes)
+ {
+ if (it_default_sizes->product_size == product_size) {
+ continue;
+ }
+ product_size = it_default_sizes->product_size;
+ while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
+ it_all_pot_sizes->product_size != product_size)
+ {
+ ++it_all_pot_sizes;
+ }
+ if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
+ break;
+ }
+ uint16_t best_pot_block_size = 0;
+ float best_pot_gflops = 0;
+ for (auto it = it_all_pot_sizes;
+ it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
+ ++it)
+ {
+ if (it->gflops > best_pot_gflops) {
+ best_pot_gflops = it->gflops;
+ best_pot_block_size = it->pot_block_size;
+ }
+ }
+ results_entry_t entry;
+ entry.product_size = product_size;
+ entry.default_block_size = it_default_sizes->nonpot_block_size;
+ entry.best_pot_block_size = best_pot_block_size;
+ entry.default_gflops = it_default_sizes->gflops;
+ entry.best_pot_gflops = best_pot_gflops;
+ entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
+ results.push_back(entry);
+
+ size_triple_t t(product_size);
+ if (t.k == t.m && t.m == t.n) {
+ cubic_results.push_back(entry);
+ }
+ }
+
+ cout << "All results:" << endl;
+ for (auto it = results.begin(); it != results.end(); ++it) {
+ cout << *it << endl;
+ }
+ cout << endl;
+
+ sort(results.begin(), results.end(), lower_efficiency);
+
+ const size_t n = min<size_t>(20, results.size());
+ cout << n << " worst results:" << endl;
+ for (size_t i = 0; i < n; i++) {
+ cout << results[i] << endl;
+ }
+ cout << endl;
+
+ cout << "cubic results:" << endl;
+ for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
+ cout << *it << endl;
+ }
+ cout << endl;
+
+ sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
+
+ cout.precision(2);
+ vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
+ for (auto it = a.begin(); it != a.end(); ++it) {
+ size_t n = min(results.size() - 1, size_t(*it * results.size()));
+ cout << (100.0f * n / (results.size() - 1))
+ << " % of product sizes have default efficiency <= "
+ << 100.0f * results[n].default_efficiency << " %" << endl;
+ }
+ cout.precision(default_precision);
+ }
+};
+
+
+void show_usage_and_exit(int argc, char* argv[],
+ const vector<unique_ptr<action_t>>& available_actions)
+{
+ cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
+ cerr << "available actions:" << endl;
+ for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+ cerr << " " << (*it)->invokation_name() << endl;
+ }
+ cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
+ exit(1);
+}
+
+int main(int argc, char* argv[])
+{
+ cout.precision(default_precision);
+ cerr.precision(default_precision);
+
+ vector<unique_ptr<action_t>> available_actions;
+ available_actions.emplace_back(new partition_action_t);
+ available_actions.emplace_back(new evaluate_defaults_action_t);
+
+ vector<string> input_filenames;
+
+ action_t* action = nullptr;
+
+ if (argc < 2) {
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+ for (int i = 1; i < argc; i++) {
+ bool arg_handled = false;
+ // Step 1. Try to match action invokation names.
+ for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+ if (!strcmp(argv[i], (*it)->invokation_name())) {
+ if (!action) {
+ action = it->get();
+ arg_handled = true;
+ break;
+ } else {
+ cerr << "can't specify more than one action!" << endl;
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+ }
+ }
+ if (arg_handled) {
+ continue;
+ }
+ // Step 2. Try to match option names.
+ if (argv[i][0] == '-') {
+ if (!strcmp(argv[i], "--only-cubic-sizes")) {
+ only_cubic_sizes = true;
+ arg_handled = true;
+ }
+ if (!strcmp(argv[i], "--dump-tables")) {
+ dump_tables = true;
+ arg_handled = true;
+ }
+ if (!arg_handled) {
+ cerr << "Unrecognized option: " << argv[i] << endl;
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+ }
+ if (arg_handled) {
+ continue;
+ }
+ // Step 3. Default to interpreting args as input filenames.
+ input_filenames.emplace_back(argv[i]);
+ }
+
+ if (dump_tables && only_cubic_sizes) {
+ cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+
+ if (!action) {
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+
+ action->run(input_filenames);
+}
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 8222271fb..0974ebe4c 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -148,7 +148,7 @@ int main(int argc, char ** argv)
int m = s;
int n = s;
int p = s;
- int cache_size = -1;
+ int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
bool need_help = false;
for (int i=1; i<argc;)
@@ -169,7 +169,13 @@ int main(int argc, char ** argv)
else if(argv[i][1]=='c')
{
++i;
- cache_size = atoi(argv[i++]);
+ cache_size1 = atoi(argv[i++]);
+ if(argv[i][0]!='-')
+ {
+ cache_size2 = atoi(argv[i++]);
+ if(argv[i][0]!='-')
+ cache_size3 = atoi(argv[i++]);
+ }
}
else if(argv[i][1]=='t')
{
@@ -191,14 +197,14 @@ int main(int argc, char ** argv)
if(need_help)
{
- std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
+ std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
std::cout << " <matrix sizes> : size\n";
std::cout << " <matrix sizes> : rows columns depth\n";
return 1;
}
- if(cache_size>0)
- setCpuCacheSizes(cache_size,96*cache_size);
+ if(cache_size1>0)
+ setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
A a(m,p); a.setRandom();
diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp
index 398fef835..129afcfb2 100644
--- a/bench/bench_norm.cpp
+++ b/bench/bench_norm.cpp
@@ -6,19 +6,25 @@ using namespace Eigen;
using namespace std;
template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v)
{
return v.norm();
}
template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v)
+{
+ return v.stableNorm();
+}
+
+template<typename T>
+EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v)
{
return v.hypotNorm();
}
template<typename T>
-EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v)
+EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v)
{
return v.blueNorm();
}
@@ -217,20 +223,21 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
}
#define BENCH_PERF(NRM) { \
+ float af = 0; double ad = 0; std::complex<float> ac = 0; \
Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
for (int k=0; k<tries; ++k) { \
tf.start(); \
- for (int i=0; i<iters; ++i) NRM(vf); \
+ for (int i=0; i<iters; ++i) { af += NRM(vf); } \
tf.stop(); \
} \
for (int k=0; k<tries; ++k) { \
td.start(); \
- for (int i=0; i<iters; ++i) NRM(vd); \
+ for (int i=0; i<iters; ++i) { ad += NRM(vd); } \
td.stop(); \
} \
/*for (int k=0; k<std::max(1,tries/3); ++k) { \
tcf.start(); \
- for (int i=0; i<iters; ++i) NRM(vcf); \
+ for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \
tcf.stop(); \
} */\
std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \
@@ -316,14 +323,17 @@ int main(int argc, char** argv)
std::cout << "\n";
}
+ y = 1;
std::cout.precision(4);
- std::cerr << "Performance (out of cache):\n";
+ int s1 = 1024*1024*32;
+ std::cerr << "Performance (out of cache, " << s1 << "):\n";
{
int iters = 1;
- VectorXf vf = VectorXf::Random(1024*1024*32) * y;
- VectorXd vd = VectorXd::Random(1024*1024*32) * y;
- VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y;
+ VectorXf vf = VectorXf::Random(s1) * y;
+ VectorXd vd = VectorXd::Random(s1) * y;
+ VectorXcf vcf = VectorXcf::Random(s1) * y;
BENCH_PERF(sqsumNorm);
+ BENCH_PERF(stableNorm);
BENCH_PERF(blueNorm);
BENCH_PERF(pblueNorm);
BENCH_PERF(lapackNorm);
@@ -332,13 +342,14 @@ int main(int argc, char** argv)
BENCH_PERF(bl2passNorm);
}
- std::cerr << "\nPerformance (in cache):\n";
+ std::cerr << "\nPerformance (in cache, " << 512 << "):\n";
{
int iters = 100000;
VectorXf vf = VectorXf::Random(512) * y;
VectorXd vd = VectorXd::Random(512) * y;
VectorXcf vcf = VectorXcf::Random(512) * y;
BENCH_PERF(sqsumNorm);
+ BENCH_PERF(stableNorm);
BENCH_PERF(blueNorm);
BENCH_PERF(pblueNorm);
BENCH_PERF(lapackNorm);
diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp
new file mode 100644
index 000000000..827be2880
--- /dev/null
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -0,0 +1,677 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+#include <fstream>
+#include <memory>
+#include <cstdio>
+
+bool eigen_use_specific_block_size;
+int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
+#include <Eigen/Core>
+
+#include <bench/BenchTimer.h>
+
+using namespace Eigen;
+using namespace std;
+
+static BenchTimer timer;
+
+// how many times we repeat each measurement.
+// measurements are randomly shuffled - we're not doing
+// all N identical measurements in a row.
+const int measurement_repetitions = 3;
+
+// Timings below this value are too short to be accurate,
+// we'll repeat measurements with more iterations until
+// we get a timing above that threshold.
+const float min_accurate_time = 1e-2f;
+
+// See --min-working-set-size command line parameter.
+size_t min_working_set_size = 0;
+
+float max_clock_speed = 0.0f;
+
+// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
+const size_t maxsize = 2048;
+const size_t minsize = 16;
+
+typedef MatrixXf MatrixType;
+typedef MatrixType::Scalar Scalar;
+typedef internal::packet_traits<Scalar>::type Packet;
+
+static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
+static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
+static_assert(maxsize > minsize, "maxsize must be larger than minsize");
+static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+ size_t k, m, n;
+ size_triple_t() : k(0), m(0), n(0) {}
+ size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+ size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+ size_triple_t(uint16_t compact)
+ {
+ k = 1 << ((compact & 0xf00) >> 8);
+ m = 1 << ((compact & 0x0f0) >> 4);
+ n = 1 << ((compact & 0x00f) >> 0);
+ }
+};
+
+uint8_t log2_pot(size_t x) {
+ size_t l = 0;
+ while (x >>= 1) l++;
+ return l;
+}
+
+// Convert between size tripes and a compact form fitting in 12 bits
+// where each size, which must be a POT, is encoded as its log2, on 4 bits
+// so the largest representable size is 2^15 == 32k ... big enough.
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+ return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+uint16_t compact_size_triple(const size_triple_t& t)
+{
+ return compact_size_triple(t.k, t.m, t.n);
+}
+
+// A single benchmark. Initially only contains benchmark params.
+// Then call run(), which stores the result in the gflops field.
+struct benchmark_t
+{
+ uint16_t compact_product_size;
+ uint16_t compact_block_size;
+ bool use_default_block_size;
+ float gflops;
+ benchmark_t()
+ : compact_product_size(0)
+ , compact_block_size(0)
+ , use_default_block_size(false)
+ , gflops(0)
+ {
+ }
+ benchmark_t(size_t pk, size_t pm, size_t pn,
+ size_t bk, size_t bm, size_t bn)
+ : compact_product_size(compact_size_triple(pk, pm, pn))
+ , compact_block_size(compact_size_triple(bk, bm, bn))
+ , use_default_block_size(false)
+ , gflops(0)
+ {}
+ benchmark_t(size_t pk, size_t pm, size_t pn)
+ : compact_product_size(compact_size_triple(pk, pm, pn))
+ , compact_block_size(0)
+ , use_default_block_size(true)
+ , gflops(0)
+ {}
+
+ void run();
+};
+
+ostream& operator<<(ostream& s, const benchmark_t& b)
+{
+ s << hex << b.compact_product_size << dec;
+ if (b.use_default_block_size) {
+ size_triple_t t(b.compact_product_size);
+ Index k = t.k, m = t.m, n = t.n;
+ internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
+ s << " default(" << k << ", " << m << ", " << n << ")";
+ } else {
+ s << " " << hex << b.compact_block_size << dec;
+ }
+ s << " " << b.gflops;
+ return s;
+}
+
+// We sort first by increasing benchmark parameters,
+// then by decreasing performance.
+bool operator<(const benchmark_t& b1, const benchmark_t& b2)
+{
+ return b1.compact_product_size < b2.compact_product_size ||
+ (b1.compact_product_size == b2.compact_product_size && (
+ (b1.compact_block_size < b2.compact_block_size || (
+ b1.compact_block_size == b2.compact_block_size &&
+ b1.gflops > b2.gflops))));
+}
+
+void benchmark_t::run()
+{
+ size_triple_t productsizes(compact_product_size);
+
+ if (use_default_block_size) {
+ eigen_use_specific_block_size = false;
+ } else {
+ // feed eigen with our custom blocking params
+ eigen_use_specific_block_size = true;
+ size_triple_t blocksizes(compact_block_size);
+ eigen_block_size_k = blocksizes.k;
+ eigen_block_size_m = blocksizes.m;
+ eigen_block_size_n = blocksizes.n;
+ }
+
+ // set up the matrix pool
+
+ const size_t combined_three_matrices_sizes =
+ sizeof(Scalar) *
+ (productsizes.k * productsizes.m +
+ productsizes.k * productsizes.n +
+ productsizes.m * productsizes.n);
+
+ // 64 M is large enough that nobody has a cache bigger than that,
+ // while still being small enough that everybody has this much RAM,
+ // so conveniently we don't need to special-case platforms here.
+ const size_t unlikely_large_cache_size = 64 << 20;
+
+ const size_t working_set_size =
+ min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
+
+ const size_t matrix_pool_size =
+ 1 + working_set_size / combined_three_matrices_sizes;
+
+ MatrixType *lhs = new MatrixType[matrix_pool_size];
+ MatrixType *rhs = new MatrixType[matrix_pool_size];
+ MatrixType *dst = new MatrixType[matrix_pool_size];
+
+ for (size_t i = 0; i < matrix_pool_size; i++) {
+ lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
+ rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
+ dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
+ }
+
+ // main benchmark loop
+
+ int iters_at_a_time = 1;
+ float time_per_iter = 0.0f;
+ size_t matrix_index = 0;
+ while (true) {
+
+ double starttime = timer.getCpuTime();
+ for (int i = 0; i < iters_at_a_time; i++) {
+ dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
+ matrix_index++;
+ if (matrix_index == matrix_pool_size) {
+ matrix_index = 0;
+ }
+ }
+ double endtime = timer.getCpuTime();
+
+ const float timing = float(endtime - starttime);
+
+ if (timing >= min_accurate_time) {
+ time_per_iter = timing / iters_at_a_time;
+ break;
+ }
+
+ iters_at_a_time *= 2;
+ }
+
+ delete[] lhs;
+ delete[] rhs;
+ delete[] dst;
+
+ gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
+}
+
+void print_cpuinfo()
+{
+#ifdef __linux__
+ cout << "contents of /proc/cpuinfo:" << endl;
+ string line;
+ ifstream cpuinfo("/proc/cpuinfo");
+ if (cpuinfo.is_open()) {
+ while (getline(cpuinfo, line)) {
+ cout << line << endl;
+ }
+ cpuinfo.close();
+ }
+ cout << endl;
+#elif defined __APPLE__
+ cout << "output of sysctl hw:" << endl;
+ system("sysctl hw");
+ cout << endl;
+#endif
+}
+
+template <typename T>
+string type_name()
+{
+ return "unknown";
+}
+
+template<>
+string type_name<float>()
+{
+ return "float";
+}
+
+template<>
+string type_name<double>()
+{
+ return "double";
+}
+
+struct action_t
+{
+ virtual const char* invokation_name() const { abort(); return nullptr; }
+ virtual void run() const { abort(); }
+ virtual ~action_t() {}
+};
+
+void show_usage_and_exit(int /*argc*/, char* argv[],
+ const vector<unique_ptr<action_t>>& available_actions)
+{
+ cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
+ cerr << "available actions:" << endl << endl;
+ for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+ cerr << " " << (*it)->invokation_name() << endl;
+ }
+ cerr << endl;
+ cerr << "options:" << endl << endl;
+ cerr << " --min-working-set-size=N:" << endl;
+ cerr << " Set the minimum working set size to N bytes." << endl;
+ cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
+ cerr << " A larger working set lowers the chance of a warm cache." << endl;
+ cerr << " The default value 0 means use a large enough working" << endl;
+ cerr << " set to likely outsize caches." << endl;
+ cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
+ cerr << " avoid warm caches." << endl;
+ exit(1);
+}
+
+float measure_clock_speed()
+{
+ cerr << "Measuring clock speed... \r" << flush;
+
+ vector<float> all_gflops;
+ for (int i = 0; i < 8; i++) {
+ benchmark_t b(1024, 1024, 1024);
+ b.run();
+ all_gflops.push_back(b.gflops);
+ }
+
+ sort(all_gflops.begin(), all_gflops.end());
+ float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
+
+ // multiply by an arbitrary constant to discourage trying doing anything with the
+ // returned values besides just comparing them with each other.
+ float result = stable_estimate * 123.456f;
+
+ return result;
+}
+
+struct human_duration_t
+{
+ int seconds;
+ human_duration_t(int s) : seconds(s) {}
+};
+
+ostream& operator<<(ostream& s, const human_duration_t& d)
+{
+ int remainder = d.seconds;
+ if (remainder > 3600) {
+ int hours = remainder / 3600;
+ s << hours << " h ";
+ remainder -= hours * 3600;
+ }
+ if (remainder > 60) {
+ int minutes = remainder / 60;
+ s << minutes << " min ";
+ remainder -= minutes * 60;
+ }
+ if (d.seconds < 600) {
+ s << remainder << " s";
+ }
+ return s;
+}
+
+const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
+
+void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
+{
+ FILE* file = fopen(filename, "w");
+ if (!file) {
+ cerr << "Could not open file " << filename << " for writing." << endl;
+ cerr << "Do you have write permissions on the current working directory?" << endl;
+ exit(1);
+ }
+ size_t benchmarks_vector_size = benchmarks.size();
+ fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
+ fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
+ fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
+ fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
+ fclose(file);
+}
+
+bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
+{
+ FILE* file = fopen(filename, "r");
+ if (!file) {
+ return false;
+ }
+ if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
+ return false;
+ }
+ size_t benchmarks_vector_size = 0;
+ if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
+ return false;
+ }
+ if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
+ return false;
+ }
+ benchmarks.resize(benchmarks_vector_size);
+ if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
+ return false;
+ }
+ unlink(filename);
+ return true;
+}
+
+void try_run_some_benchmarks(
+ vector<benchmark_t>& benchmarks,
+ double time_start,
+ size_t& first_benchmark_to_run)
+{
+ if (first_benchmark_to_run == benchmarks.size()) {
+ return;
+ }
+
+ double time_last_progress_update = 0;
+ double time_last_clock_speed_measurement = 0;
+ double time_now = 0;
+
+ size_t benchmark_index = first_benchmark_to_run;
+
+ while (true) {
+ float ratio_done = float(benchmark_index) / benchmarks.size();
+ time_now = timer.getRealTime();
+
+ // We check clock speed every minute and at the end.
+ if (benchmark_index == benchmarks.size() ||
+ time_now > time_last_clock_speed_measurement + 60.0f)
+ {
+ time_last_clock_speed_measurement = time_now;
+
+ // Ensure that clock speed is as expected
+ float current_clock_speed = measure_clock_speed();
+
+ // The tolerance needs to be smaller than the relative difference between
+ // clock speeds that a device could operate under.
+ // It seems unlikely that a device would be throttling clock speeds by
+ // amounts smaller than 2%.
+ // With a value of 1%, I was getting within noise on a Sandy Bridge.
+ const float clock_speed_tolerance = 0.02f;
+
+ if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
+ // Clock speed is now higher than we previously measured.
+ // Either our initial measurement was inaccurate, which won't happen
+ // too many times as we are keeping the best clock speed value and
+ // and allowing some tolerance; or something really weird happened,
+ // which invalidates all benchmark results collected so far.
+ // Either way, we better restart all over again now.
+ if (benchmark_index) {
+ cerr << "Restarting at " << 100.0f * ratio_done
+ << " % because clock speed increased. " << endl;
+ }
+ max_clock_speed = current_clock_speed;
+ first_benchmark_to_run = 0;
+ return;
+ }
+
+ bool rerun_last_tests = false;
+
+ if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+ cerr << "Measurements completed so far: "
+ << 100.0f * ratio_done
+ << " % " << endl;
+ cerr << "Clock speed seems to be only "
+ << current_clock_speed/max_clock_speed
+ << " times what it used to be." << endl;
+
+ unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
+
+ while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+ if (seconds_to_sleep_if_lower_clock_speed > 32) {
+ cerr << "Sleeping longer probably won't make a difference." << endl;
+ cerr << "Serializing benchmarks to " << session_filename << endl;
+ serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
+ cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
+ exit(2);
+ }
+ rerun_last_tests = true;
+ cerr << "Sleeping "
+ << seconds_to_sleep_if_lower_clock_speed
+ << " s... \r" << endl;
+ sleep(seconds_to_sleep_if_lower_clock_speed);
+ current_clock_speed = measure_clock_speed();
+ seconds_to_sleep_if_lower_clock_speed *= 2;
+ }
+ }
+
+ if (rerun_last_tests) {
+ cerr << "Redoing the last "
+ << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
+ << " % because clock speed had been low. " << endl;
+ return;
+ }
+
+ // nothing wrong with the clock speed so far, so there won't be a need to rerun
+ // benchmarks run so far in case we later encounter a lower clock speed.
+ first_benchmark_to_run = benchmark_index;
+ }
+
+ if (benchmark_index == benchmarks.size()) {
+ // We're done!
+ first_benchmark_to_run = benchmarks.size();
+ // Erase progress info
+ cerr << " " << endl;
+ return;
+ }
+
+ // Display progress info on stderr
+ if (time_now > time_last_progress_update + 1.0f) {
+ time_last_progress_update = time_now;
+ cerr << "Measurements... " << 100.0f * ratio_done
+ << " %, ETA "
+ << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
+ << " \r" << flush;
+ }
+
+ // This is where we actually run a benchmark!
+ benchmarks[benchmark_index].run();
+ benchmark_index++;
+ }
+}
+
+void run_benchmarks(vector<benchmark_t>& benchmarks)
+{
+ size_t first_benchmark_to_run;
+ vector<benchmark_t> deserialized_benchmarks;
+ bool use_deserialized_benchmarks = false;
+ if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
+ cerr << "Found serialized session with "
+ << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
+ << " % already done" << endl;
+ if (deserialized_benchmarks.size() == benchmarks.size() &&
+ first_benchmark_to_run > 0 &&
+ first_benchmark_to_run < benchmarks.size())
+ {
+ use_deserialized_benchmarks = true;
+ }
+ }
+
+ if (use_deserialized_benchmarks) {
+ benchmarks = deserialized_benchmarks;
+ } else {
+ // not using deserialized benchmarks, starting from scratch
+ first_benchmark_to_run = 0;
+
+ // Randomly shuffling benchmarks allows us to get accurate enough progress info,
+ // as now the cheap/expensive benchmarks are randomly mixed so they average out.
+ // It also means that if data is corrupted for some time span, the odds are that
+ // not all repetitions of a given benchmark will be corrupted.
+ random_shuffle(benchmarks.begin(), benchmarks.end());
+ }
+
+ for (int i = 0; i < 4; i++) {
+ max_clock_speed = max(max_clock_speed, measure_clock_speed());
+ }
+
+ double time_start = 0.0;
+ while (first_benchmark_to_run < benchmarks.size()) {
+ if (first_benchmark_to_run == 0) {
+ time_start = timer.getRealTime();
+ }
+ try_run_some_benchmarks(benchmarks,
+ time_start,
+ first_benchmark_to_run);
+ }
+
+ // Sort timings by increasing benchmark parameters, and decreasing gflops.
+ // The latter is very important. It means that we can ignore all but the first
+ // benchmark with given parameters.
+ sort(benchmarks.begin(), benchmarks.end());
+
+ // Collect best (i.e. now first) results for each parameter values.
+ vector<benchmark_t> best_benchmarks;
+ for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+ if (best_benchmarks.empty() ||
+ best_benchmarks.back().compact_product_size != it->compact_product_size ||
+ best_benchmarks.back().compact_block_size != it->compact_block_size)
+ {
+ best_benchmarks.push_back(*it);
+ }
+ }
+
+ // keep and return only the best benchmarks
+ benchmarks = best_benchmarks;
+}
+
+struct measure_all_pot_sizes_action_t : action_t
+{
+ virtual const char* invokation_name() const { return "all-pot-sizes"; }
+ virtual void run() const
+ {
+ vector<benchmark_t> benchmarks;
+ for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+ for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+ for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+ for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+ for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
+ for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
+ for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
+ benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ run_benchmarks(benchmarks);
+
+ cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
+ for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+ cout << *it << endl;
+ }
+ }
+};
+
+struct measure_default_sizes_action_t : action_t
+{
+ virtual const char* invokation_name() const { return "default-sizes"; }
+ virtual void run() const
+ {
+ vector<benchmark_t> benchmarks;
+ for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+ for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+ for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+ for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+ benchmarks.emplace_back(ksize, msize, nsize);
+ }
+ }
+ }
+ }
+
+ run_benchmarks(benchmarks);
+
+ cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
+ for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+ cout << *it << endl;
+ }
+ }
+};
+
+int main(int argc, char* argv[])
+{
+ double time_start = timer.getRealTime();
+ cout.precision(4);
+ cerr.precision(4);
+
+ vector<unique_ptr<action_t>> available_actions;
+ available_actions.emplace_back(new measure_all_pot_sizes_action_t);
+ available_actions.emplace_back(new measure_default_sizes_action_t);
+
+ auto action = available_actions.end();
+
+ if (argc <= 1) {
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+ for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+ if (!strcmp(argv[1], (*it)->invokation_name())) {
+ action = it;
+ break;
+ }
+ }
+
+ if (action == available_actions.end()) {
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+
+ for (int i = 2; i < argc; i++) {
+ if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
+ const char* equals_sign = strchr(argv[i], '=');
+ min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+ } else {
+ cerr << "unrecognized option: " << argv[i] << endl << endl;
+ show_usage_and_exit(argc, argv, available_actions);
+ }
+ }
+
+ print_cpuinfo();
+
+ cout << "benchmark parameters:" << endl;
+ cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+ cout << "scalar type: " << type_name<Scalar>() << endl;
+ cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
+ cout << "minsize = " << minsize << endl;
+ cout << "maxsize = " << maxsize << endl;
+ cout << "measurement_repetitions = " << measurement_repetitions << endl;
+ cout << "min_accurate_time = " << min_accurate_time << endl;
+ cout << "min_working_set_size = " << min_working_set_size;
+ if (min_working_set_size == 0) {
+ cout << " (try to outsize caches)";
+ }
+ cout << endl << endl;
+
+ (*action)->run();
+
+ double time_end = timer.getRealTime();
+ cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
+}
diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt
index b299d9899..9444b450c 100644
--- a/bench/btl/CMakeLists.txt
+++ b/bench/btl/CMakeLists.txt
@@ -97,6 +97,7 @@ ENABLE_TESTING()
add_subdirectory(libs/eigen3)
add_subdirectory(libs/eigen2)
+add_subdirectory(libs/tensors)
add_subdirectory(libs/BLAS)
add_subdirectory(libs/ublas)
add_subdirectory(libs/gmm)
diff --git a/bench/btl/cmake/FindACML.cmake b/bench/btl/cmake/FindACML.cmake
index f45ae1b0d..4989fa2f4 100644
--- a/bench/btl/cmake/FindACML.cmake
+++ b/bench/btl/cmake/FindACML.cmake
@@ -17,6 +17,7 @@ find_file(ACML_LIBRARIES
libacml_mp.so
PATHS
/usr/lib
+ /usr/lib64
$ENV{ACMLDIR}/lib
${LIB_INSTALL_DIR}
)
@@ -35,6 +36,7 @@ if(NOT ACML_LIBRARIES)
libacml.so libacml_mv.so
PATHS
/usr/lib
+ /usr/lib64
$ENV{ACMLDIR}/lib
${LIB_INSTALL_DIR}
)
diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake
index 14b1dee09..4136a989d 100644
--- a/bench/btl/cmake/FindATLAS.cmake
+++ b/bench/btl/cmake/FindATLAS.cmake
@@ -3,18 +3,13 @@ if (ATLAS_LIBRARIES)
set(ATLAS_FIND_QUIETLY TRUE)
endif (ATLAS_LIBRARIES)
-find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LAPACK NAMES liblapack_atlas.so.3 liblapack.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LAPACK NAMES lapack_atlas lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-if(NOT ATLAS_LAPACK)
- find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
- find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-endif(NOT ATLAS_LAPACK)
-
-find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
diff --git a/bench/btl/cmake/FindCBLAS.cmake b/bench/btl/cmake/FindCBLAS.cmake
index 554f0291b..ce0f2f2b2 100644
--- a/bench/btl/cmake/FindCBLAS.cmake
+++ b/bench/btl/cmake/FindCBLAS.cmake
@@ -23,6 +23,7 @@ find_file(CBLAS_LIBRARIES
libcblas.so.3
PATHS
/usr/lib
+ /usr/lib64
$ENV{CBLASDIR}/lib
${LIB_INSTALL_DIR}
)
diff --git a/bench/btl/cmake/FindOPENBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake
index c76fc251c..2a0919436 100644
--- a/bench/btl/cmake/FindOPENBLAS.cmake
+++ b/bench/btl/cmake/FindOPENBLAS.cmake
@@ -3,7 +3,7 @@ if (OPENBLAS_LIBRARIES)
set(OPENBLAS_FIND_QUIETLY TRUE)
endif (OPENBLAS_LIBRARIES)
-find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh
index 47fe58135..1deabdae2 100644
--- a/bench/btl/libs/eigen2/eigen2_interface.hh
+++ b/bench/btl/libs/eigen2/eigen2_interface.hh
@@ -47,7 +47,7 @@ public :
{
#if defined(EIGEN_VECTORIZE_SSE)
if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
- #elif defined(EIGEN_VECTORIZE_ALTIVEC)
+ #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
#else
if (SIZE==Dynamic) return "eigen2_novec"; else return "tiny_eigen2_novec";
diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt
new file mode 100644
index 000000000..09d6d8e43
--- /dev/null
+++ b/bench/btl/libs/tensors/CMakeLists.txt
@@ -0,0 +1,44 @@
+
+
+if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
+ # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
+ set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
+ set(TENSOR_FOUND TRUE)
+else()
+ find_package(Tensor)
+endif()
+
+if (TENSOR_FOUND)
+
+ include_directories(${TENSOR_INCLUDE_DIR})
+ btl_add_bench(btl_tensor_linear main_linear.cpp)
+ btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
+ btl_add_bench(btl_tensor_matmat main_matmat.cpp)
+
+ btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+ btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+ btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
+
+ option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
+ if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
+ btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
+ btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
+ btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
+
+ btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+ btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+ btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
+ endif()
+
+
+ if(NOT BTL_NOVEC)
+ btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
+ btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
+ btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
+ btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+ btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+ btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
+
+ endif(NOT BTL_NOVEC)
+
+endif (TENSOR_FOUND)
diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp
new file mode 100644
index 000000000..e257f1e72
--- /dev/null
+++ b/bench/btl/libs/tensors/main_linear.cpp
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+ bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+ bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+
+ return 0;
+}
diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp
new file mode 100644
index 000000000..675fcfc6d
--- /dev/null
+++ b/bench/btl/libs/tensors/main_matmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+ bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+
+ return 0;
+}
diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp
new file mode 100644
index 000000000..1af00c81b
--- /dev/null
+++ b/bench/btl/libs/tensors/main_vecmat.cpp
@@ -0,0 +1,21 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#include "utilities.h"
+#include "tensor_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+ bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+
+ return 0;
+}
diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh
new file mode 100644
index 000000000..97b8e0f0b
--- /dev/null
+++ b/bench/btl/libs/tensors/tensor_interface.hh
@@ -0,0 +1,105 @@
+//=====================================================
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//=====================================================
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+#ifndef TENSOR_INTERFACE_HH
+#define TENSOR_INTERFACE_HH
+
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <vector>
+#include "btl.hh"
+
+using namespace Eigen;
+
+template<class real>
+class tensor_interface
+{
+public :
+ typedef real real_type;
+ typedef typename Eigen::Tensor<real,2>::Index Index;
+
+ typedef std::vector<real> stl_vector;
+ typedef std::vector<stl_vector> stl_matrix;
+
+ typedef Eigen::Tensor<real,2> gene_matrix;
+ typedef Eigen::Tensor<real,1> gene_vector;
+
+
+ static inline std::string name( void )
+ {
+ return EIGEN_MAKESTRING(BTL_PREFIX);
+ }
+
+ static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
+
+ static void free_vector(gene_vector & /*B*/) {}
+
+ static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+ A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size()));
+
+ for (unsigned int j=0; j<A_stl.size() ; j++){
+ for (unsigned int i=0; i<A_stl[j].size() ; i++){
+ A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
+ }
+ }
+ }
+
+ static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+ B.resize(B_stl.size());
+
+ for (unsigned int i=0; i<B_stl.size() ; i++){
+ B.coeffRef(i) = B_stl[i];
+ }
+ }
+
+ static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+ for (unsigned int i=0; i<B_stl.size() ; i++){
+ B_stl[i] = B.coeff(i);
+ }
+ }
+
+ static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+ int N=A_stl.size();
+
+ for (int j=0;j<N;j++){
+ A_stl[j].resize(N);
+ for (int i=0;i<N;i++){
+ A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
+ }
+ }
+ }
+
+ static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
+ typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+ const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ X/*.noalias()*/ = A.contract(B, dims);
+ }
+
+ static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
+ typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
+ const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ X/*.noalias()*/ = A.contract(B, dims);
+ }
+
+ static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
+ Y += X.constant(coef) * X;
+ }
+
+ static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
+ Y = X.constant(a)*X + Y.constant(b)*Y;
+ }
+
+ static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
+ cible = source;
+ }
+
+ static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
+ cible = source;
+ }
+};
+
+#endif
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
new file mode 100644
index 000000000..40a71c781
--- /dev/null
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -0,0 +1,45 @@
+#3.0.1
+#3.1.1
+#3.2.0
+3.2.4
+#5745:37f59e65eb6c
+5891:d8652709345d # introduce AVX
+#5893:24b4dc92c6d3 # merge
+5895:997c2ef9fc8b # introduce FMA
+#5904:e1eafd14eaa1 # complex and AVX
+5908:f8ee3c721251 # improve packing with ptranspose
+#5921:ca808bb456b0 # merge
+#5927:8b1001f9e3ac
+5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks
+#5949:f3488f4e45b2 # merge
+#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec
+#5992:4a429f5e0483 # merge
+before-evaluators
+#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products
+#6639:c9121c60b5c7
+#6655:06f163b5221f # Properly detect FMA support on ARM
+#6677:700e023044e7 # FMA has been wrongly disabled
+#6681:11d31dafb0e3
+#6699:5e6e8e10aad1 # merge default to tensors
+#6726:ff2d2388e7b9 # merge default to tensors
+#6742:0cbd6195e829 # merge default to tensors
+#6747:853d2bafeb8f # Generalized the gebp apis
+6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
+#6781:9cc5a931b2c6 # generalized gemv
+#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product
+#6844:039efd86b75c # merge tensor
+6845:7333ed40c6ef # change prefetching in gebp
+#6856:b5be5e10eb7f # merge index conversion
+#6893:c3a64aba7c70 # clean blocking size computation
+#6898:6fb31ebe6492 # rotating kernel for ARM
+6899:877facace746 # rotating kernel for ARM only
+#6904:c250623ae9fa # result_of
+6921:915f1b1fc158 # fix prefetching change for ARM
+6923:9ff25f6dacc6 # prefetching
+6933:52572e60b5d3 # blocking size strategy
+6937:c8c042f286b2 # avoid redundant pack_rhs
+6981:7e5d6f78da59 # dynamic loop swapping
+6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
+6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
+7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+
diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp
new file mode 100644
index 000000000..72eb9cab6
--- /dev/null
+++ b/bench/perf_monitoring/gemm/gemm.cpp
@@ -0,0 +1,67 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <Eigen/Core>
+#include "../../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
+
+EIGEN_DONT_INLINE
+void gemm(const Mat &A, const Mat &B, Mat &C)
+{
+ C.noalias() += A * B;
+}
+
+EIGEN_DONT_INLINE
+double bench(long m, long n, long k)
+{
+ Mat A(m,k);
+ Mat B(k,n);
+ Mat C(m,n);
+ A.setRandom();
+ B.setRandom();
+ C.setZero();
+
+ BenchTimer t;
+
+ double up = 1e8*4/sizeof(Scalar);
+ double tm0 = 4, tm1 = 10;
+ if(NumTraits<Scalar>::IsComplex)
+ {
+ up /= 4;
+ tm0 = 2;
+ tm1 = 4;
+ }
+
+ double flops = 2. * m * n * k;
+ long rep = std::max(1., std::min(100., up/flops) );
+ long tries = std::max(tm0, std::min(tm1, up/flops) );
+
+ BENCH(t, tries, rep, gemm(A,B,C));
+
+ return 1e-9 * rep * flops / t.best();
+}
+
+int main(int argc, char **argv)
+{
+ std::vector<double> results;
+
+ std::ifstream settings("settings.txt");
+ long m, n, k;
+ while(settings >> m >> n >> k)
+ {
+ //std::cerr << " Testing " << m << " " << n << " " << k << std::endl;
+ results.push_back( bench(m, n, k) );
+ }
+
+ std::cout << RowVectorXd::Map(results.data(), results.size());
+
+ return 0;
+}
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
new file mode 100755
index 000000000..609c471f9
--- /dev/null
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# base name of the bench
+# it reads $1.out
+# and generates $1.pdf
+WHAT=$1
+
+header="rev "
+while read line
+do
+  if [ ! -z "$line" ]; then
+ header="$header \"$line\""
+ fi
+done < settings.txt
+
+echo $header > $WHAT.out.header
+cat $WHAT.out >> $WHAT.out.header
+
+
+echo "set title '$WHAT'" > $WHAT.gnuplot
+echo "set key autotitle columnhead outside " >> $WHAT.gnuplot
+echo "set xtics rotate 1" >> $WHAT.gnuplot
+
+echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
+echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
+
+col=`cat settings.txt | wc -l`
+echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
+echo " " >> $WHAT.gnuplot
+
+gnuplot -persist < $WHAT.gnuplot
+
+# generate a png file
+# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten .$WHAT.png
+
+# clean
+rm $WHAT.out.header $WHAT.gnuplot
\ No newline at end of file
diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh
new file mode 100755
index 000000000..3fa6a3661
--- /dev/null
+++ b/bench/perf_monitoring/gemm/run_gemm.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+# Examples of environment variables to be set:
+# PREFIX="haswell-fma-"
+# CXX_FLAGS="-mfma"
+
+# Options:
+# -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+# -s : recompute selected changesets only and keep bests
+
+
+if echo "$*" | grep '\-up' > /dev/null; then
+ update=true
+else
+ update=false
+fi
+
+if echo "$*" | grep '\-s' > /dev/null; then
+ selected=true
+else
+ selected=false
+fi
+
+global_args="$*"
+
+if [ $selected == true ]; then
+ echo "Recompute selected changesets only and keep bests"
+elif [ $update == true ]; then
+ echo "(Re-)Compute all changesets and keep bests"
+else
+ echo "Skip previously computed changesets"
+fi
+
+
+
+if [ ! -d "eigen_src" ]; then
+ hg clone https://bitbucket.org/eigen/eigen eigen_src
+else
+ cd eigen_src
+ hg pull -u
+ cd ..
+fi
+
+if [ -z "$CXX" ]; then
+ CXX=g++
+fi
+
+function make_backup
+{
+ if [ -f "$1.out" ]; then
+ mv "$1.out" "$1.backup"
+ fi
+}
+
+function merge
+{
+ count1=`echo $1 | wc -w`
+ count2=`echo $2 | wc -w`
+
+ if [ $count1 == $count2 ]; then
+ a=( $1 ); b=( $2 )
+ res=""
+ for (( i=0 ; i<$count1 ; i++ )); do
+ ai=${a[$i]}; bi=${b[$i]}
+ tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l`
+ res="$res $tmp"
+ done
+ echo $res
+
+ else
+ echo $1
+ fi
+}
+
+function test_current
+{
+ rev=$1
+ scalar=$2
+ name=$3
+
+ prev=""
+ if [ -e "$name.backup" ]; then
+ prev=`grep $rev "$name.backup" | cut -c 14-`
+ fi
+ res=$prev
+ count_rev=`echo $prev | wc -w`
+ count_ref=`cat "settings.txt" | wc -l`
+ if echo "$global_args" | grep "$rev" > /dev/null; then
+ rev_found=true
+ else
+ rev_found=false
+ fi
+# echo $update et $selected et $rev_found because $rev et "$global_args"
+# echo $count_rev et $count_ref
+ if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then
+ if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
+ curr=`./$name`
+ if [ $count_rev == $count_ref ]; then
+ echo "merge previous $prev"
+ echo "with new $curr"
+ else
+ echo "got $curr"
+ fi
+ res=`merge "$curr" "$prev"`
+# echo $res
+ echo "$rev $res" >> $name.out
+ else
+ echo "Compilation failed, skip rev $rev"
+ fi
+ else
+ echo "Skip existing results for $rev / $name"
+ echo "$rev $res" >> $name.out
+ fi
+}
+
+make_backup $PREFIX"sgemm"
+make_backup $PREFIX"dgemm"
+make_backup $PREFIX"cgemm"
+
+cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
+do
+  if [ ! -z "$rev" ]; then
+ echo "Testing rev $rev"
+ cd eigen_src
+ hg up -C $rev > /dev/null
+ actual_rev=`hg identify | cut -f1 -d' '`
+ cd ..
+
+ test_current $actual_rev float $PREFIX"sgemm"
+ test_current $actual_rev double $PREFIX"dgemm"
+ test_current $actual_rev "std::complex<double>" $PREFIX"cgemm"
+ fi
+
+done
+
+echo "Float:"
+cat $PREFIX"sgemm.out"
+echo ""
+
+echo "Double:"
+cat $PREFIX"dgemm.out"
+echo ""
+
+echo "Complex:"
+cat $PREFIX"cgemm.out"
+echo ""
+
+./make_plot.sh $PREFIX"sgemm"
+./make_plot.sh $PREFIX"dgemm"
+./make_plot.sh $PREFIX"cgemm"
+
+
diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt
new file mode 100644
index 000000000..5c43e1c7d
--- /dev/null
+++ b/bench/perf_monitoring/gemm/settings.txt
@@ -0,0 +1,15 @@
+8 8 8
+9 9 9
+24 24 24
+239 239 239
+240 240 240
+2400 24 24
+24 2400 24
+24 24 2400
+24 2400 2400
+2400 24 2400
+2400 2400 24
+2400 2400 64
+4800 23 160
+23 4800 160
+2400 2400 2400
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
new file mode 100644
index 000000000..525b9acda
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks.h
@@ -0,0 +1,305 @@
+#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
+
+typedef int TensorIndex;
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "testing/base/public/benchmark.h"
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+// TODO(bsteiner): also templatize on the input type since we have users
+// for int8 as well as floats.
+template <typename Device> class BenchmarkSuite {
+ public:
+ BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
+ : m_(m), k_(k), n_(n), device_(device) {
+ initialize();
+ }
+
+ BenchmarkSuite(const Device& device, size_t m)
+ : m_(m), k_(m), n_(m), device_(device) {
+ initialize();
+ }
+
+ ~BenchmarkSuite() {
+ device_.deallocate(a_);
+ device_.deallocate(b_);
+ device_.deallocate(c_);
+ }
+
+ void memcpy(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
+ }
+ // Record the number of values copied per second
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ void random(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = C.random();
+ }
+ // Record the number of random numbers generated per second
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ void slicing(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+ const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+ const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
+ const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
+ const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
+ const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.slice(first_quadrant, quarter_sizes).device(device_) =
+ A.slice(first_quadrant, quarter_sizes);
+ C.slice(second_quadrant, quarter_sizes).device(device_) =
+ B.slice(second_quadrant, quarter_sizes);
+ C.slice(third_quadrant, quarter_sizes).device(device_) =
+ A.slice(third_quadrant, quarter_sizes);
+ C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+ B.slice(fourth_quadrant, quarter_sizes);
+ }
+ // Record the number of values copied from the rhs slice to the lhs slice
+ // each second
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ void shuffling(int num_iters) {
+ eigen_assert(m_ == n_);
+ const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+ const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+ const Eigen::array<int, 2> shuffle(1, 0);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.shuffle(shuffle);
+ }
+ // Record the number of values shuffled from A and copied to B each second
+ finalizeBenchmark(m_ * k_ * num_iters);
+ }
+
+ void padding(int num_iters) {
+ eigen_assert(m_ == k_);
+ const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+ const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+ Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
+ paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
+ paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.pad(paddings);
+ }
+ // Record the number of values copied from the padded tensor A each second
+ finalizeBenchmark(m_ * k_ * num_iters);
+ }
+
+ void striding(int num_iters) {
+ eigen_assert(m_ == k_);
+ const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+ const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+
+ const Eigen::array<TensorIndex, 2> strides(1, 2);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.stride(strides);
+ }
+ // Record the number of values copied from the padded tensor A each second
+ finalizeBenchmark(m_ * k_ * num_iters);
+ }
+
+ void broadcasting(int num_iters) {
+ const Eigen::array<TensorIndex, 2> size_a(m_, 1);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
+ const Eigen::array<TensorIndex, 2> size_c(m_, n_);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
+
+#if defined(__CUDACC__)
+ // nvcc doesn't support cxx11
+ const Eigen::array<int, 2> broadcast(1, n_);
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
+ broadcast.set(1, n_);
+#endif
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A.broadcast(broadcast);
+ }
+ // Record the number of values broadcasted from A and copied to C each second
+ finalizeBenchmark(m_ * n_ * num_iters);
+ }
+
+ void coeffWiseOp(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
+ }
+ // Record the number of FLOP executed per second (2 multiplications and
+ // 1 addition per value)
+ finalizeBenchmark(3 * m_ * m_ * num_iters);
+ }
+
+ void algebraicFunc(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ void transcendentalFunc(int num_iters) {
+ eigen_assert(m_ == k_ && k_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A.exp() + B.log();
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ // Simple reduction
+ void reduction(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size(k_, n_);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size(n_);
+ TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+
+ const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.sum(sum_along_dim);
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(m_ * m_ * num_iters);
+ }
+
+ // do a contraction which is equivalent to a matrix multiplication
+ void contraction(int num_iters) {
+ const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
+ const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
+ const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
+ const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
+
+ typedef typename Tensor<float, 2>::DimensionPair DimPair;
+ const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A.contract(B, dims);
+ }
+ // Record the number of FLOP executed per second (size_ multiplications and
+ // additions for each value in the resulting tensor)
+ finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
+ }
+
+ void convolution(int num_iters, int kernel_x, int kernel_y) {
+ const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
+ const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+ const Eigen::array<TensorIndex, 2> result_sizes(
+ m_ - kernel_x + 1, n_ - kernel_y + 1);
+ TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
+ Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = A.convolve(B, dims);
+ }
+ // Record the number of FLOP executed per second (kernel_size
+ // multiplications and additions for each value in the resulting tensor)
+ finalizeBenchmark(
+ (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
+ }
+
+ private:
+ void initialize() {
+ a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
+ b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
+ c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+
+ // Initialize the content of the memory pools to prevent asan from
+ // complaining.
+ device_.memset(a_, 12, m_ * k_ * sizeof(float));
+ device_.memset(b_, 23, k_ * n_ * sizeof(float));
+ device_.memset(c_, 31, m_ * n_ * sizeof(float));
+
+ BenchmarkUseRealTime();
+ }
+
+ inline void finalizeBenchmark(int64 num_items) {
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+ if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+ device_.synchronize();
+ }
+#endif
+ StopBenchmarkTiming();
+ SetBenchmarkItemsProcessed(num_items);
+ }
+
+
+ size_t m_;
+ size_t k_;
+ size_t n_;
+ float* a_;
+ float* b_;
+ float* c_;
+ Device device_;
+};
+#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
new file mode 100644
index 000000000..68653ba15
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -0,0 +1,156 @@
+#define EIGEN_USE_THREADS
+
+#include "base/sysinfo.h"
+#include "strings/strcat.h"
+#include "third_party/eigen3/tensor_benchmarks.h"
+#include "thread/threadpool.h"
+
+#ifdef __ANDROID__
+#define CREATE_THREAD_POOL(threads) \
+Eigen::ThreadPoolDevice device(threads);
+#else
+#define CREATE_THREAD_POOL(threads) \
+ThreadPool tp(threads); \
+tp.StartWorkers(); \
+Eigen::ThreadPoolDevice device(&tp, threads);
+#endif
+
+// Simple functions
+#define BM_FuncCPU(FUNC, THREADS) \
+ static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ CREATE_THREAD_POOL(THREADS); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+ suite.FUNC(iters); \
+ SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
+
+BM_FuncCPU(memcpy, 4);
+BM_FuncCPU(memcpy, 8);
+BM_FuncCPU(memcpy, 12);
+
+BM_FuncCPU(random, 4);
+BM_FuncCPU(random, 8);
+BM_FuncCPU(random, 12);
+
+BM_FuncCPU(slicing, 4);
+BM_FuncCPU(slicing, 8);
+BM_FuncCPU(slicing, 12);
+
+BM_FuncCPU(shuffling, 4);
+BM_FuncCPU(shuffling, 8);
+BM_FuncCPU(shuffling, 12);
+
+BM_FuncCPU(padding, 4);
+BM_FuncCPU(padding, 8);
+BM_FuncCPU(padding, 12);
+
+BM_FuncCPU(striding, 4);
+BM_FuncCPU(striding, 8);
+BM_FuncCPU(striding, 12);
+
+BM_FuncCPU(broadcasting, 4);
+BM_FuncCPU(broadcasting, 8);
+BM_FuncCPU(broadcasting, 12);
+
+BM_FuncCPU(coeffWiseOp, 4);
+BM_FuncCPU(coeffWiseOp, 8);
+BM_FuncCPU(coeffWiseOp, 12);
+
+BM_FuncCPU(algebraicFunc, 4);
+BM_FuncCPU(algebraicFunc, 8);
+BM_FuncCPU(algebraicFunc, 12);
+
+BM_FuncCPU(transcendentalFunc, 4);
+BM_FuncCPU(transcendentalFunc, 8);
+BM_FuncCPU(transcendentalFunc, 12);
+
+BM_FuncCPU(reduction, 4);
+BM_FuncCPU(reduction, 8);
+BM_FuncCPU(reduction, 12);
+
+
+// Contractions
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
+ static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
+ StopBenchmarkTiming(); \
+ if (THREADS == 1) { \
+ Eigen::DefaultDevice device; \
+ BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
+ suite.FUNC(iters); \
+ } else { \
+ CREATE_THREAD_POOL(THREADS); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
+ suite.FUNC(iters); \
+ } \
+ SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
+
+
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
+BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
+BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
+
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
+ static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ CREATE_THREAD_POOL(THREADS); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+ suite.FUNC(iters, DIM1, DIM2); \
+ SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
+BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
+
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc
new file mode 100644
index 000000000..adea754ad
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_gpu.cc
@@ -0,0 +1,75 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "strings/strcat.h"
+#include "third_party/eigen3/tensor_benchmarks.h"
+
+
+
+// Simple functions
+#define BM_FuncGPU(FUNC) \
+ static void BM_##FUNC(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ cudaStream_t stream; \
+ cudaStreamCreate(&stream); \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters); \
+ cudaStreamDestroy(stream); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(reduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
+ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ cudaStream_t stream; \
+ cudaStreamCreate(&stream); \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters); \
+ cudaStreamDestroy(stream); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
+ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ cudaStream_t stream; \
+ cudaStreamCreate(&stream); \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters, DIM1, DIM2); \
+ cudaStreamDestroy(stream); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);