diff options
Diffstat (limited to 'bench')
23 files changed, 2662 insertions, 27 deletions
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp new file mode 100644 index 000000000..d563a1d2d --- /dev/null +++ b/bench/analyze-blocking-sizes.cpp @@ -0,0 +1,876 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include <iostream> +#include <cstdint> +#include <cstdlib> +#include <vector> +#include <algorithm> +#include <fstream> +#include <string> +#include <cmath> +#include <cassert> +#include <cstring> +#include <memory> + +#include <Eigen/Core> + +using namespace std; + +const int default_precision = 4; + +// see --only-cubic-sizes +bool only_cubic_sizes = false; + +// see --dump-tables +bool dump_tables = false; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + uint16_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +ostream& operator<<(ostream& s, const size_triple_t& t) +{ + return s << "(" << t.k << ", " << t.m << ", " << t.n << ")"; +} + +struct inputfile_entry_t +{ + uint16_t product_size; + uint16_t pot_block_size; + size_triple_t nonpot_block_size; + float gflops; +}; + +struct inputfile_t +{ + enum class type_t { + unknown, + all_pot_sizes, + default_sizes + }; + + string filename; + vector<inputfile_entry_t> entries; + type_t type; + + inputfile_t(const string& fname) + : filename(fname) + , type(type_t::unknown) + { + ifstream stream(filename); + if (!stream.is_open()) { + cerr << "couldn't open input file: " << filename << endl; + exit(1); + } + string line; + while (getline(stream, line)) { + if (line.empty()) continue; + if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::all_pot_sizes; + continue; + } + if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::default_sizes; + continue; + } + + + if (type == type_t::unknown) { + continue; + } + switch(type) { + case type_t::all_pot_sizes: { + unsigned int product_size, block_size; + float gflops; + int sscanf_result = + sscanf(line.c_str(), "%x %x %f", + &product_size, + &block_size, + &gflops); + if (3 != sscanf_result || + !product_size || + product_size > 0xfff || + !block_size || + block_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = uint16_t(block_size); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + case type_t::default_sizes: { + unsigned int product_size; + float gflops; + int bk, bm, bn; + int sscanf_result = + sscanf(line.c_str(), "%x default(%d, %d, %d) %f", + &product_size, + &bk, &bm, &bn, + &gflops); + if (5 != sscanf_result || + !product_size || + product_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = 0; + entry.nonpot_block_size = size_triple_t(bk, bm, bn); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + + default: + break; + } + } + stream.close(); + if (type == type_t::unknown) { + cerr << "Unrecognized input file " << filename << endl; + exit(1); + } + if (entries.empty()) { + cerr << "didn't find any measurements in input file: " << filename << endl; + exit(1); + } + } +}; + +struct preprocessed_inputfile_entry_t +{ + uint16_t product_size; + uint16_t block_size; + + float efficiency; +}; + +bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2) +{ + return e1.efficiency < e2.efficiency; +} + +struct preprocessed_inputfile_t +{ + string filename; + vector<preprocessed_inputfile_entry_t> entries; + + preprocessed_inputfile_t(const inputfile_t& inputfile) + : filename(inputfile.filename) + { + if (inputfile.type != inputfile_t::type_t::all_pot_sizes) { + abort(); + } + auto it = inputfile.entries.begin(); + auto it_first_with_given_product_size = it; + while (it != inputfile.entries.end()) { + ++it; + if (it == inputfile.entries.end() || + it->product_size != it_first_with_given_product_size->product_size) + { + import_input_file_range_one_product_size(it_first_with_given_product_size, it); + it_first_with_given_product_size = it; + } + } + } + +private: + void import_input_file_range_one_product_size( + const vector<inputfile_entry_t>::const_iterator& begin, + const vector<inputfile_entry_t>::const_iterator& end) + { + uint16_t product_size = begin->product_size; + float max_gflops = 0.0f; + for (auto it = begin; it != end; ++it) { + if (it->product_size != product_size) { + cerr << "Unexpected ordering of entries in " << filename << endl; + cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl; + exit(1); + } + max_gflops = max(max_gflops, it->gflops); + } + for (auto it = begin; it != end; ++it) { + preprocessed_inputfile_entry_t entry; + entry.product_size = it->product_size; + entry.block_size = it->pot_block_size; + entry.efficiency = it->gflops / max_gflops; + entries.push_back(entry); + } + } +}; + +void check_all_files_in_same_exact_order( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles) +{ + if (preprocessed_inputfiles.empty()) { + return; + } + + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0]; + const size_t num_entries = first_file.entries.size(); + + for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) { + if (preprocessed_inputfiles[i].entries.size() != num_entries) { + cerr << "these files have different number of entries: " + << preprocessed_inputfiles[i].filename + << " and " + << first_file.filename + << endl; + exit(1); + } + } + + for (size_t entry_index = 0; entry_index < num_entries; entry_index++) { + const uint16_t entry_product_size = first_file.entries[entry_index].product_size; + const uint16_t entry_block_size = first_file.entries[entry_index].block_size; + for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) { + const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index]; + if (cur_file.entries[entry_index].product_size != entry_product_size || + cur_file.entries[entry_index].block_size != entry_block_size) + { + cerr << "entries not in same order between these files: " + << first_file.filename + << " and " + << cur_file.filename + << endl; + exit(1); + } + } + } +} + +float efficiency_of_subset( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + const vector<size_t>& subset) +{ + if (subset.size() <= 1) { + return 1.0f; + } + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + float efficiency = 1.0f; + size_t entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float efficiency_this_product_size = 0.0f; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry); + } + efficiency = min(efficiency, efficiency_this_product_size); + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + + return efficiency; +} + +void dump_table_for_subset( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + const vector<size_t>& subset) +{ + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + size_t entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + size_t i = 0; + size_triple_t min_product_size(first_file.entries.front().product_size); + size_triple_t max_product_size(first_file.entries.back().product_size); + if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) { + abort(); + } + if (only_cubic_sizes) { + cerr << "Can't generate tables with --only-cubic-sizes." << endl; + abort(); + } + cout << "struct LookupTable {" << endl; + cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl; + const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; + const size_t TableSize = NumSizes * NumSizes * NumSizes; + cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; + cout << " static const unsigned short* Data() {" << endl; + cout << " static const unsigned short data[" << TableSize << "] = {"; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float best_efficiency_this_product_size = 0.0f; + uint16_t best_block_size_this_product_size = 0; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + if (efficiency_this_entry > best_efficiency_this_product_size) { + best_efficiency_this_product_size = efficiency_this_entry; + best_block_size_this_product_size = first_file.entries[e].block_size; + } + } + if ((i++) % NumSizes) { + cout << " "; + } else { + cout << endl << " "; + } + cout << "0x" << hex << best_block_size_this_product_size << dec; + if (entry_index < num_entries) { + cout << ","; + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + if (i != TableSize) { + cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl; + abort(); + } + cout << endl << " };" << endl; + cout << " return data;" << endl; + cout << " }" << endl; + cout << "};" << endl; +} + +float efficiency_of_partition( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + const vector<vector<size_t>>& partition) +{ + float efficiency = 1.0f; + for (auto s = partition.begin(); s != partition.end(); ++s) { + efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s)); + } + return efficiency; +} + +void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size) +{ + assert(subset_size >= 1 && subset_size <= set_size); + out_subset.resize(subset_size); + for (size_t i = 0; i < subset_size; i++) { + out_subset[i] = i; + } +} + +bool is_last_subset(const vector<size_t>& subset, size_t set_size) +{ + return subset[0] == set_size - subset.size(); +} + +void next_subset(vector<size_t>& inout_subset, size_t set_size) +{ + if (is_last_subset(inout_subset, set_size)) { + cerr << "iterating past the last subset" << endl; + abort(); + } + size_t i = 1; + while (inout_subset[inout_subset.size() - i] == set_size - i) { + i++; + assert(i <= inout_subset.size()); + } + size_t first_index_to_change = inout_subset.size() - i; + inout_subset[first_index_to_change]++; + size_t p = inout_subset[first_index_to_change]; + for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) { + inout_subset[j] = ++p; + } +} + +const size_t number_of_subsets_limit = 100; +const size_t always_search_subsets_of_size_at_least = 2; + +bool is_number_of_subsets_feasible(size_t n, size_t p) +{ + assert(n>0 && p>0 && p<=n); + uint64_t numerator = 1, denominator = 1; + for (size_t i = 0; i < p; i++) { + numerator *= n - i; + denominator *= i + 1; + if (numerator > denominator * number_of_subsets_limit) { + return false; + } + } + return true; +} + +size_t max_feasible_subset_size(size_t n) +{ + assert(n > 0); + const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least); + for (size_t p = 1; p <= n - 1; p++) { + if (!is_number_of_subsets_feasible(n, p+1)) { + return max(p, minresult); + } + } + return n - 1; +} + +void find_subset_with_efficiency_higher_than( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + float required_efficiency_to_beat, + vector<size_t>& inout_remainder, + vector<size_t>& out_subset) +{ + out_subset.resize(0); + + if (required_efficiency_to_beat >= 1.0f) { + cerr << "can't beat efficiency 1." << endl; + abort(); + } + + while (!inout_remainder.empty()) { + + vector<size_t> candidate_indices(inout_remainder.size()); + for (size_t i = 0; i < candidate_indices.size(); i++) { + candidate_indices[i] = i; + } + + size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size()); + while (candidate_indices_subset_size >= 1) { + vector<size_t> candidate_indices_subset; + make_first_subset(candidate_indices_subset_size, + candidate_indices_subset, + candidate_indices.size()); + + vector<size_t> best_candidate_indices_subset; + float best_efficiency = 0.0f; + vector<size_t> trial_subset = out_subset; + trial_subset.resize(out_subset.size() + candidate_indices_subset_size); + while (true) + { + for (size_t i = 0; i < candidate_indices_subset_size; i++) { + trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]]; + } + + float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset); + if (trial_efficiency > best_efficiency) { + best_efficiency = trial_efficiency; + best_candidate_indices_subset = candidate_indices_subset; + } + if (is_last_subset(candidate_indices_subset, candidate_indices.size())) { + break; + } + next_subset(candidate_indices_subset, candidate_indices.size()); + } + + if (best_efficiency > required_efficiency_to_beat) { + for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) { + candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]]; + } + candidate_indices.resize(best_candidate_indices_subset.size()); + } + candidate_indices_subset_size--; + } + + size_t candidate_index = candidate_indices[0]; + auto candidate_iterator = inout_remainder.begin() + candidate_index; + vector<size_t> trial_subset = out_subset; + + trial_subset.push_back(*candidate_iterator); + float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset); + if (trial_efficiency > required_efficiency_to_beat) { + out_subset.push_back(*candidate_iterator); + inout_remainder.erase(candidate_iterator); + } else { + break; + } + } +} + +void find_partition_with_efficiency_higher_than( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + float required_efficiency_to_beat, + vector<vector<size_t>>& out_partition) +{ + out_partition.resize(0); + + vector<size_t> remainder; + for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) { + remainder.push_back(i); + } + + while (!remainder.empty()) { + vector<size_t> new_subset; + find_subset_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + remainder, + new_subset); + out_partition.push_back(new_subset); + } +} + +void print_partition( + const vector<preprocessed_inputfile_t>& preprocessed_inputfiles, + const vector<vector<size_t>>& partition) +{ + float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency" << endl; + for (auto subset = partition.begin(); subset != partition.end(); ++subset) { + cout << " Subset " << (subset - partition.begin()) + << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:" + << endl; + for (auto file = subset->begin(); file != subset->end(); ++file) { + cout << " " << preprocessed_inputfiles[*file].filename << endl; + } + if (dump_tables) { + cout << " Table:" << endl; + dump_table_for_subset(preprocessed_inputfiles, *subset); + } + } + cout << endl; +} + +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run(const vector<string>&) const { abort(); } + virtual ~action_t() {} +}; + +struct partition_action_t : action_t +{ + virtual const char* invokation_name() const override { return "partition"; } + virtual void run(const vector<string>& input_filenames) const override + { + vector<preprocessed_inputfile_t> preprocessed_inputfiles; + + if (input_filenames.empty()) { + cerr << "The " << invokation_name() << " action needs a list of input files." << endl; + exit(1); + } + + for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) { + inputfile_t inputfile(*it); + switch (inputfile.type) { + case inputfile_t::type_t::all_pot_sizes: + preprocessed_inputfiles.emplace_back(inputfile); + break; + case inputfile_t::type_t::default_sizes: + cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and " + << "has no use for " << *it << " which contains measurements for default sizes." << endl; + exit(1); + break; + default: + cerr << "Unrecognized input file: " << *it << endl; + exit(1); + } + } + + check_all_files_in_same_exact_order(preprocessed_inputfiles); + + float required_efficiency_to_beat = 0.0f; + vector<vector<vector<size_t>>> partitions; + cerr << "searching for partitions...\r" << flush; + while (true) + { + vector<vector<size_t>> partition; + find_partition_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + partition); + float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() + << " subsets for " << 100.0f * actual_efficiency + << " % efficiency" + << " \r" << flush; + partitions.push_back(partition); + if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { + break; + } + required_efficiency_to_beat = actual_efficiency; + } + cerr << " " << endl; + while (true) { + bool repeat = false; + for (size_t i = 0; i < partitions.size() - 1; i++) { + if (partitions[i].size() >= partitions[i+1].size()) { + partitions.erase(partitions.begin() + i); + repeat = true; + break; + } + } + if (!repeat) { + break; + } + } + for (auto it = partitions.begin(); it != partitions.end(); ++it) { + print_partition(preprocessed_inputfiles, *it); + } + } +}; + +struct evaluate_defaults_action_t : action_t +{ + struct results_entry_t { + uint16_t product_size; + size_triple_t default_block_size; + uint16_t best_pot_block_size; + float default_gflops; + float best_pot_gflops; + float default_efficiency; + }; + friend ostream& operator<<(ostream& s, const results_entry_t& entry) + { + return s + << "Product size " << size_triple_t(entry.product_size) + << ": default block size " << entry.default_block_size + << " -> " << entry.default_gflops + << " GFlop/s = " << entry.default_efficiency * 100.0f << " %" + << " of best POT block size " << size_triple_t(entry.best_pot_block_size) + << " -> " << entry.best_pot_gflops + << " GFlop/s" << dec; + } + static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { + return e1.default_efficiency < e2.default_efficiency; + } + virtual const char* invokation_name() const override { return "evaluate-defaults"; } + void show_usage_and_exit() const + { + cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; + cerr << "checks how well the performance with default sizes compares to the best " + << "performance measured over all POT sizes." << endl; + exit(1); + } + virtual void run(const vector<string>& input_filenames) const override + { + if (input_filenames.size() != 2) { + show_usage_and_exit(); + } + inputfile_t inputfile_default_sizes(input_filenames[0]); + inputfile_t inputfile_all_pot_sizes(input_filenames[1]); + if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { + cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; + show_usage_and_exit(); + } + if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) { + cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl; + show_usage_and_exit(); + } + vector<results_entry_t> results; + vector<results_entry_t> cubic_results; + + uint16_t product_size = 0; + auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin(); + for (auto it_default_sizes = inputfile_default_sizes.entries.begin(); + it_default_sizes != inputfile_default_sizes.entries.end(); + ++it_default_sizes) + { + if (it_default_sizes->product_size == product_size) { + continue; + } + product_size = it_default_sizes->product_size; + while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() && + it_all_pot_sizes->product_size != product_size) + { + ++it_all_pot_sizes; + } + if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) { + break; + } + uint16_t best_pot_block_size = 0; + float best_pot_gflops = 0; + for (auto it = it_all_pot_sizes; + it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size; + ++it) + { + if (it->gflops > best_pot_gflops) { + best_pot_gflops = it->gflops; + best_pot_block_size = it->pot_block_size; + } + } + results_entry_t entry; + entry.product_size = product_size; + entry.default_block_size = it_default_sizes->nonpot_block_size; + entry.best_pot_block_size = best_pot_block_size; + entry.default_gflops = it_default_sizes->gflops; + entry.best_pot_gflops = best_pot_gflops; + entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops; + results.push_back(entry); + + size_triple_t t(product_size); + if (t.k == t.m && t.m == t.n) { + cubic_results.push_back(entry); + } + } + + cout << "All results:" << endl; + for (auto it = results.begin(); it != results.end(); ++it) { + cout << *it << endl; + } + cout << endl; + + sort(results.begin(), results.end(), lower_efficiency); + + const size_t n = min<size_t>(20, results.size()); + cout << n << " worst results:" << endl; + for (size_t i = 0; i < n; i++) { + cout << results[i] << endl; + } + cout << endl; + + cout << "cubic results:" << endl; + for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { + cout << *it << endl; + } + cout << endl; + + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + + cout.precision(2); + vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; + for (auto it = a.begin(); it != a.end(); ++it) { + size_t n = min(results.size() - 1, size_t(*it * results.size())); + cout << (100.0f * n / (results.size() - 1)) + << " % of product sizes have default efficiency <= " + << 100.0f * results[n].default_efficiency << " %" << endl; + } + cout.precision(default_precision); + } +}; + + +void show_usage_and_exit(int argc, char* argv[], + const vector<unique_ptr<action_t>>& available_actions) +{ + cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl; + cerr << "available actions:" << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; + exit(1); +} + +int main(int argc, char* argv[]) +{ + cout.precision(default_precision); + cerr.precision(default_precision); + + vector<unique_ptr<action_t>> available_actions; + available_actions.emplace_back(new partition_action_t); + available_actions.emplace_back(new evaluate_defaults_action_t); + + vector<string> input_filenames; + + action_t* action = nullptr; + + if (argc < 2) { + show_usage_and_exit(argc, argv, available_actions); + } + for (int i = 1; i < argc; i++) { + bool arg_handled = false; + // Step 1. Try to match action invokation names. + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[i], (*it)->invokation_name())) { + if (!action) { + action = it->get(); + arg_handled = true; + break; + } else { + cerr << "can't specify more than one action!" << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + } + if (arg_handled) { + continue; + } + // Step 2. Try to match option names. + if (argv[i][0] == '-') { + if (!strcmp(argv[i], "--only-cubic-sizes")) { + only_cubic_sizes = true; + arg_handled = true; + } + if (!strcmp(argv[i], "--dump-tables")) { + dump_tables = true; + arg_handled = true; + } + if (!arg_handled) { + cerr << "Unrecognized option: " << argv[i] << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + if (arg_handled) { + continue; + } + // Step 3. Default to interpreting args as input filenames. + input_filenames.emplace_back(argv[i]); + } + + if (dump_tables && only_cubic_sizes) { + cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl; + show_usage_and_exit(argc, argv, available_actions); + } + + if (!action) { + show_usage_and_exit(argc, argv, available_actions); + } + + action->run(input_filenames); +} diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 8222271fb..0974ebe4c 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -148,7 +148,7 @@ int main(int argc, char ** argv) int m = s; int n = s; int p = s; - int cache_size = -1; + int cache_size1=-1, cache_size2=l2, cache_size3 = 0; bool need_help = false; for (int i=1; i<argc;) @@ -169,7 +169,13 @@ int main(int argc, char ** argv) else if(argv[i][1]=='c') { ++i; - cache_size = atoi(argv[i++]); + cache_size1 = atoi(argv[i++]); + if(argv[i][0]!='-') + { + cache_size2 = atoi(argv[i++]); + if(argv[i][0]!='-') + cache_size3 = atoi(argv[i++]); + } } else if(argv[i][1]=='t') { @@ -191,14 +197,14 @@ int main(int argc, char ** argv) if(need_help) { - std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n"; + std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n"; std::cout << " <matrix sizes> : size\n"; std::cout << " <matrix sizes> : rows columns depth\n"; return 1; } - if(cache_size>0) - setCpuCacheSizes(cache_size,96*cache_size); + if(cache_size1>0) + setCpuCacheSizes(cache_size1,cache_size2,cache_size3); A a(m,p); a.setRandom(); diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp index 398fef835..129afcfb2 100644 --- a/bench/bench_norm.cpp +++ b/bench/bench_norm.cpp @@ -6,19 +6,25 @@ using namespace Eigen; using namespace std; template<typename T> -EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v) { return v.norm(); } template<typename T> -EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v) +{ + return v.stableNorm(); +} + +template<typename T> +EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v) { return v.hypotNorm(); } template<typename T> -EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v) +EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v) { return v.blueNorm(); } @@ -217,20 +223,21 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) } #define BENCH_PERF(NRM) { \ + float af = 0; double ad = 0; std::complex<float> ac = 0; \ Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\ for (int k=0; k<tries; ++k) { \ tf.start(); \ - for (int i=0; i<iters; ++i) NRM(vf); \ + for (int i=0; i<iters; ++i) { af += NRM(vf); } \ tf.stop(); \ } \ for (int k=0; k<tries; ++k) { \ td.start(); \ - for (int i=0; i<iters; ++i) NRM(vd); \ + for (int i=0; i<iters; ++i) { ad += NRM(vd); } \ td.stop(); \ } \ /*for (int k=0; k<std::max(1,tries/3); ++k) { \ tcf.start(); \ - for (int i=0; i<iters; ++i) NRM(vcf); \ + for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \ tcf.stop(); \ } */\ std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \ @@ -316,14 +323,17 @@ int main(int argc, char** argv) std::cout << "\n"; } + y = 1; std::cout.precision(4); - std::cerr << "Performance (out of cache):\n"; + int s1 = 1024*1024*32; + std::cerr << "Performance (out of cache, " << s1 << "):\n"; { int iters = 1; - VectorXf vf = VectorXf::Random(1024*1024*32) * y; - VectorXd vd = VectorXd::Random(1024*1024*32) * y; - VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y; + VectorXf vf = VectorXf::Random(s1) * y; + VectorXd vd = VectorXd::Random(s1) * y; + VectorXcf vcf = VectorXcf::Random(s1) * y; BENCH_PERF(sqsumNorm); + BENCH_PERF(stableNorm); BENCH_PERF(blueNorm); BENCH_PERF(pblueNorm); BENCH_PERF(lapackNorm); @@ -332,13 +342,14 @@ int main(int argc, char** argv) BENCH_PERF(bl2passNorm); } - std::cerr << "\nPerformance (in cache):\n"; + std::cerr << "\nPerformance (in cache, " << 512 << "):\n"; { int iters = 100000; VectorXf vf = VectorXf::Random(512) * y; VectorXd vd = VectorXd::Random(512) * y; VectorXcf vcf = VectorXcf::Random(512) * y; BENCH_PERF(sqsumNorm); + BENCH_PERF(stableNorm); BENCH_PERF(blueNorm); BENCH_PERF(pblueNorm); BENCH_PERF(lapackNorm); diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp new file mode 100644 index 000000000..827be2880 --- /dev/null +++ b/bench/benchmark-blocking-sizes.cpp @@ -0,0 +1,677 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include <iostream> +#include <cstdint> +#include <cstdlib> +#include <vector> +#include <fstream> +#include <memory> +#include <cstdio> + +bool eigen_use_specific_block_size; +int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n +#include <Eigen/Core> + +#include <bench/BenchTimer.h> + +using namespace Eigen; +using namespace std; + +static BenchTimer timer; + +// how many times we repeat each measurement. +// measurements are randomly shuffled - we're not doing +// all N identical measurements in a row. +const int measurement_repetitions = 3; + +// Timings below this value are too short to be accurate, +// we'll repeat measurements with more iterations until +// we get a timing above that threshold. +const float min_accurate_time = 1e-2f; + +// See --min-working-set-size command line parameter. +size_t min_working_set_size = 0; + +float max_clock_speed = 0.0f; + +// range of sizes that we will benchmark (in all 3 K,M,N dimensions) +const size_t maxsize = 2048; +const size_t minsize = 16; + +typedef MatrixXf MatrixType; +typedef MatrixType::Scalar Scalar; +typedef internal::packet_traits<Scalar>::type Packet; + +static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two"); +static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two"); +static_assert(maxsize > minsize, "maxsize must be larger than minsize"); +static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)"); + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + size_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } +}; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +// Convert between size tripes and a compact form fitting in 12 bits +// where each size, which must be a POT, is encoded as its log2, on 4 bits +// so the largest representable size is 2^15 == 32k ... big enough. +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +uint16_t compact_size_triple(const size_triple_t& t) +{ + return compact_size_triple(t.k, t.m, t.n); +} + +// A single benchmark. Initially only contains benchmark params. +// Then call run(), which stores the result in the gflops field. +struct benchmark_t +{ + uint16_t compact_product_size; + uint16_t compact_block_size; + bool use_default_block_size; + float gflops; + benchmark_t() + : compact_product_size(0) + , compact_block_size(0) + , use_default_block_size(false) + , gflops(0) + { + } + benchmark_t(size_t pk, size_t pm, size_t pn, + size_t bk, size_t bm, size_t bn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(compact_size_triple(bk, bm, bn)) + , use_default_block_size(false) + , gflops(0) + {} + benchmark_t(size_t pk, size_t pm, size_t pn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(0) + , use_default_block_size(true) + , gflops(0) + {} + + void run(); +}; + +ostream& operator<<(ostream& s, const benchmark_t& b) +{ + s << hex << b.compact_product_size << dec; + if (b.use_default_block_size) { + size_triple_t t(b.compact_product_size); + Index k = t.k, m = t.m, n = t.n; + internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n); + s << " default(" << k << ", " << m << ", " << n << ")"; + } else { + s << " " << hex << b.compact_block_size << dec; + } + s << " " << b.gflops; + return s; +} + +// We sort first by increasing benchmark parameters, +// then by decreasing performance. +bool operator<(const benchmark_t& b1, const benchmark_t& b2) +{ + return b1.compact_product_size < b2.compact_product_size || + (b1.compact_product_size == b2.compact_product_size && ( + (b1.compact_block_size < b2.compact_block_size || ( + b1.compact_block_size == b2.compact_block_size && + b1.gflops > b2.gflops)))); +} + +void benchmark_t::run() +{ + size_triple_t productsizes(compact_product_size); + + if (use_default_block_size) { + eigen_use_specific_block_size = false; + } else { + // feed eigen with our custom blocking params + eigen_use_specific_block_size = true; + size_triple_t blocksizes(compact_block_size); + eigen_block_size_k = blocksizes.k; + eigen_block_size_m = blocksizes.m; + eigen_block_size_n = blocksizes.n; + } + + // set up the matrix pool + + const size_t combined_three_matrices_sizes = + sizeof(Scalar) * + (productsizes.k * productsizes.m + + productsizes.k * productsizes.n + + productsizes.m * productsizes.n); + + // 64 M is large enough that nobody has a cache bigger than that, + // while still being small enough that everybody has this much RAM, + // so conveniently we don't need to special-case platforms here. + const size_t unlikely_large_cache_size = 64 << 20; + + const size_t working_set_size = + min_working_set_size ? min_working_set_size : unlikely_large_cache_size; + + const size_t matrix_pool_size = + 1 + working_set_size / combined_three_matrices_sizes; + + MatrixType *lhs = new MatrixType[matrix_pool_size]; + MatrixType *rhs = new MatrixType[matrix_pool_size]; + MatrixType *dst = new MatrixType[matrix_pool_size]; + + for (size_t i = 0; i < matrix_pool_size; i++) { + lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k); + rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n); + dst[i] = MatrixType::Zero(productsizes.m, productsizes.n); + } + + // main benchmark loop + + int iters_at_a_time = 1; + float time_per_iter = 0.0f; + size_t matrix_index = 0; + while (true) { + + double starttime = timer.getCpuTime(); + for (int i = 0; i < iters_at_a_time; i++) { + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; + matrix_index++; + if (matrix_index == matrix_pool_size) { + matrix_index = 0; + } + } + double endtime = timer.getCpuTime(); + + const float timing = float(endtime - starttime); + + if (timing >= min_accurate_time) { + time_per_iter = timing / iters_at_a_time; + break; + } + + iters_at_a_time *= 2; + } + + delete[] lhs; + delete[] rhs; + delete[] dst; + + gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter; +} + +void print_cpuinfo() +{ +#ifdef __linux__ + cout << "contents of /proc/cpuinfo:" << endl; + string line; + ifstream cpuinfo("/proc/cpuinfo"); + if (cpuinfo.is_open()) { + while (getline(cpuinfo, line)) { + cout << line << endl; + } + cpuinfo.close(); + } + cout << endl; +#elif defined __APPLE__ + cout << "output of sysctl hw:" << endl; + system("sysctl hw"); + cout << endl; +#endif +} + +template <typename T> +string type_name() +{ + return "unknown"; +} + +template<> +string type_name<float>() +{ + return "float"; +} + +template<> +string type_name<double>() +{ + return "double"; +} + +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run() const { abort(); } + virtual ~action_t() {} +}; + +void show_usage_and_exit(int /*argc*/, char* argv[], + const vector<unique_ptr<action_t>>& available_actions) +{ + cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl; + cerr << "available actions:" << endl << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << endl; + cerr << "options:" << endl << endl; + cerr << " --min-working-set-size=N:" << endl; + cerr << " Set the minimum working set size to N bytes." << endl; + cerr << " This is rounded up as needed to a multiple of matrix size." << endl; + cerr << " A larger working set lowers the chance of a warm cache." << endl; + cerr << " The default value 0 means use a large enough working" << endl; + cerr << " set to likely outsize caches." << endl; + cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl; + cerr << " avoid warm caches." << endl; + exit(1); +} + +float measure_clock_speed() +{ + cerr << "Measuring clock speed... \r" << flush; + + vector<float> all_gflops; + for (int i = 0; i < 8; i++) { + benchmark_t b(1024, 1024, 1024); + b.run(); + all_gflops.push_back(b.gflops); + } + + sort(all_gflops.begin(), all_gflops.end()); + float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5]; + + // multiply by an arbitrary constant to discourage trying doing anything with the + // returned values besides just comparing them with each other. + float result = stable_estimate * 123.456f; + + return result; +} + +struct human_duration_t +{ + int seconds; + human_duration_t(int s) : seconds(s) {} +}; + +ostream& operator<<(ostream& s, const human_duration_t& d) +{ + int remainder = d.seconds; + if (remainder > 3600) { + int hours = remainder / 3600; + s << hours << " h "; + remainder -= hours * 3600; + } + if (remainder > 60) { + int minutes = remainder / 60; + s << minutes << " min "; + remainder -= minutes * 60; + } + if (d.seconds < 600) { + s << remainder << " s"; + } + return s; +} + +const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"; + +void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run) +{ + FILE* file = fopen(filename, "w"); + if (!file) { + cerr << "Could not open file " << filename << " for writing." << endl; + cerr << "Do you have write permissions on the current working directory?" << endl; + exit(1); + } + size_t benchmarks_vector_size = benchmarks.size(); + fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file); + fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file); + fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file); + fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file); + fclose(file); +} + +bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run) +{ + FILE* file = fopen(filename, "r"); + if (!file) { + return false; + } + if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) { + return false; + } + size_t benchmarks_vector_size = 0; + if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) { + return false; + } + if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) { + return false; + } + benchmarks.resize(benchmarks_vector_size); + if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) { + return false; + } + unlink(filename); + return true; +} + +void try_run_some_benchmarks( + vector<benchmark_t>& benchmarks, + double time_start, + size_t& first_benchmark_to_run) +{ + if (first_benchmark_to_run == benchmarks.size()) { + return; + } + + double time_last_progress_update = 0; + double time_last_clock_speed_measurement = 0; + double time_now = 0; + + size_t benchmark_index = first_benchmark_to_run; + + while (true) { + float ratio_done = float(benchmark_index) / benchmarks.size(); + time_now = timer.getRealTime(); + + // We check clock speed every minute and at the end. + if (benchmark_index == benchmarks.size() || + time_now > time_last_clock_speed_measurement + 60.0f) + { + time_last_clock_speed_measurement = time_now; + + // Ensure that clock speed is as expected + float current_clock_speed = measure_clock_speed(); + + // The tolerance needs to be smaller than the relative difference between + // clock speeds that a device could operate under. + // It seems unlikely that a device would be throttling clock speeds by + // amounts smaller than 2%. + // With a value of 1%, I was getting within noise on a Sandy Bridge. + const float clock_speed_tolerance = 0.02f; + + if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { + // Clock speed is now higher than we previously measured. + // Either our initial measurement was inaccurate, which won't happen + // too many times as we are keeping the best clock speed value and + // and allowing some tolerance; or something really weird happened, + // which invalidates all benchmark results collected so far. + // Either way, we better restart all over again now. + if (benchmark_index) { + cerr << "Restarting at " << 100.0f * ratio_done + << " % because clock speed increased. " << endl; + } + max_clock_speed = current_clock_speed; + first_benchmark_to_run = 0; + return; + } + + bool rerun_last_tests = false; + + if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { + cerr << "Measurements completed so far: " + << 100.0f * ratio_done + << " % " << endl; + cerr << "Clock speed seems to be only " + << current_clock_speed/max_clock_speed + << " times what it used to be." << endl; + + unsigned int seconds_to_sleep_if_lower_clock_speed = 1; + + while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { + if (seconds_to_sleep_if_lower_clock_speed > 32) { + cerr << "Sleeping longer probably won't make a difference." << endl; + cerr << "Serializing benchmarks to " << session_filename << endl; + serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); + cerr << "Now restart this benchmark, and it should pick up where we left." << endl; + exit(2); + } + rerun_last_tests = true; + cerr << "Sleeping " + << seconds_to_sleep_if_lower_clock_speed + << " s... \r" << endl; + sleep(seconds_to_sleep_if_lower_clock_speed); + current_clock_speed = measure_clock_speed(); + seconds_to_sleep_if_lower_clock_speed *= 2; + } + } + + if (rerun_last_tests) { + cerr << "Redoing the last " + << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size() + << " % because clock speed had been low. " << endl; + return; + } + + // nothing wrong with the clock speed so far, so there won't be a need to rerun + // benchmarks run so far in case we later encounter a lower clock speed. + first_benchmark_to_run = benchmark_index; + } + + if (benchmark_index == benchmarks.size()) { + // We're done! + first_benchmark_to_run = benchmarks.size(); + // Erase progress info + cerr << " " << endl; + return; + } + + // Display progress info on stderr + if (time_now > time_last_progress_update + 1.0f) { + time_last_progress_update = time_now; + cerr << "Measurements... " << 100.0f * ratio_done + << " %, ETA " + << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done) + << " \r" << flush; + } + + // This is where we actually run a benchmark! + benchmarks[benchmark_index].run(); + benchmark_index++; + } +} + +void run_benchmarks(vector<benchmark_t>& benchmarks) +{ + size_t first_benchmark_to_run; + vector<benchmark_t> deserialized_benchmarks; + bool use_deserialized_benchmarks = false; + if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) { + cerr << "Found serialized session with " + << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size() + << " % already done" << endl; + if (deserialized_benchmarks.size() == benchmarks.size() && + first_benchmark_to_run > 0 && + first_benchmark_to_run < benchmarks.size()) + { + use_deserialized_benchmarks = true; + } + } + + if (use_deserialized_benchmarks) { + benchmarks = deserialized_benchmarks; + } else { + // not using deserialized benchmarks, starting from scratch + first_benchmark_to_run = 0; + + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + } + + for (int i = 0; i < 4; i++) { + max_clock_speed = max(max_clock_speed, measure_clock_speed()); + } + + double time_start = 0.0; + while (first_benchmark_to_run < benchmarks.size()) { + if (first_benchmark_to_run == 0) { + time_start = timer.getRealTime(); + } + try_run_some_benchmarks(benchmarks, + time_start, + first_benchmark_to_run); + } + + // Sort timings by increasing benchmark parameters, and decreasing gflops. + // The latter is very important. It means that we can ignore all but the first + // benchmark with given parameters. + sort(benchmarks.begin(), benchmarks.end()); + + // Collect best (i.e. now first) results for each parameter values. + vector<benchmark_t> best_benchmarks; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + if (best_benchmarks.empty() || + best_benchmarks.back().compact_product_size != it->compact_product_size || + best_benchmarks.back().compact_block_size != it->compact_block_size) + { + best_benchmarks.push_back(*it); + } + } + + // keep and return only the best benchmarks + benchmarks = best_benchmarks; +} + +struct measure_all_pot_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "all-pot-sizes"; } + virtual void run() const + { + vector<benchmark_t> benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { + for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { + for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { + benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock); + } + } + } + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +struct measure_default_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "default-sizes"; } + virtual void run() const + { + vector<benchmark_t> benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + benchmarks.emplace_back(ksize, msize, nsize); + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +int main(int argc, char* argv[]) +{ + double time_start = timer.getRealTime(); + cout.precision(4); + cerr.precision(4); + + vector<unique_ptr<action_t>> available_actions; + available_actions.emplace_back(new measure_all_pot_sizes_action_t); + available_actions.emplace_back(new measure_default_sizes_action_t); + + auto action = available_actions.end(); + + if (argc <= 1) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + + for (int i = 2; i < argc; i++) { + if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { + const char* equals_sign = strchr(argv[i], '='); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + } else { + cerr << "unrecognized option: " << argv[i] << endl << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + + print_cpuinfo(); + + cout << "benchmark parameters:" << endl; + cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; + cout << "scalar type: " << type_name<Scalar>() << endl; + cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl; + cout << "minsize = " << minsize << endl; + cout << "maxsize = " << maxsize << endl; + cout << "measurement_repetitions = " << measurement_repetitions << endl; + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { + cout << " (try to outsize caches)"; + } + cout << endl << endl; + + (*action)->run(); + + double time_end = timer.getRealTime(); + cerr << "Finished in " << human_duration_t(time_end - time_start) << endl; +} diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index b299d9899..9444b450c 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -97,6 +97,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) diff --git a/bench/btl/cmake/FindACML.cmake b/bench/btl/cmake/FindACML.cmake index f45ae1b0d..4989fa2f4 100644 --- a/bench/btl/cmake/FindACML.cmake +++ b/bench/btl/cmake/FindACML.cmake @@ -17,6 +17,7 @@ find_file(ACML_LIBRARIES libacml_mp.so PATHS /usr/lib + /usr/lib64 $ENV{ACMLDIR}/lib ${LIB_INSTALL_DIR} ) @@ -35,6 +36,7 @@ if(NOT ACML_LIBRARIES) libacml.so libacml_mv.so PATHS /usr/lib + /usr/lib64 $ENV{ACMLDIR}/lib ${LIB_INSTALL_DIR} ) diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake index 14b1dee09..4136a989d 100644 --- a/bench/btl/cmake/FindATLAS.cmake +++ b/bench/btl/cmake/FindATLAS.cmake @@ -3,18 +3,13 @@ if (ATLAS_LIBRARIES) set(ATLAS_FIND_QUIETLY TRUE) endif (ATLAS_LIBRARIES) -find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_LAPACK NAMES liblapack_atlas.so.3 liblapack.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LAPACK NAMES lapack_atlas lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -if(NOT ATLAS_LAPACK) - find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) - find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -endif(NOT ATLAS_LAPACK) - -find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) diff --git a/bench/btl/cmake/FindCBLAS.cmake b/bench/btl/cmake/FindCBLAS.cmake index 554f0291b..ce0f2f2b2 100644 --- a/bench/btl/cmake/FindCBLAS.cmake +++ b/bench/btl/cmake/FindCBLAS.cmake @@ -23,6 +23,7 @@ find_file(CBLAS_LIBRARIES libcblas.so.3 PATHS /usr/lib + /usr/lib64 $ENV{CBLASDIR}/lib ${LIB_INSTALL_DIR} ) diff --git a/bench/btl/cmake/FindOPENBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake index c76fc251c..2a0919436 100644 --- a/bench/btl/cmake/FindOPENBLAS.cmake +++ b/bench/btl/cmake/FindOPENBLAS.cmake @@ -3,7 +3,7 @@ if (OPENBLAS_LIBRARIES) set(OPENBLAS_FIND_QUIETLY TRUE) endif (OPENBLAS_LIBRARIES) -find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) +find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index 47fe58135..1deabdae2 100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -47,7 +47,7 @@ public : { #if defined(EIGEN_VECTORIZE_SSE) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; - #elif defined(EIGEN_VECTORIZE_ALTIVEC) + #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; #else if (SIZE==Dynamic) return "eigen2_novec"; else return "tiny_eigen2_novec"; diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 000000000..09d6d8e43 --- /dev/null +++ b/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 000000000..e257f1e72 --- /dev/null +++ b/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 000000000..675fcfc6d --- /dev/null +++ b/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 000000000..1af00c81b --- /dev/null +++ b/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 000000000..97b8e0f0b --- /dev/null +++ b/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include <unsupported/Eigen/CXX11/Tensor> +#include <vector> +#include "btl.hh" + +using namespace Eigen; + +template<class real> +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor<real,2>::Index Index; + + typedef std::vector<real> stl_vector; + typedef std::vector<stl_vector> stl_matrix; + + typedef Eigen::Tensor<real,2> gene_matrix; + typedef Eigen::Tensor<real,1> gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j<A_stl.size() ; j++){ + for (unsigned int i=0; i<A_stl[j].size() ; i++){ + A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i<B_stl.size() ; i++){ + B.coeffRef(i) = B_stl[i]; + } + } + + static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){ + for (unsigned int i=0; i<B_stl.size() ; i++){ + B_stl[i] = B.coeff(i); + } + } + + static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ + int N=A_stl.size(); + + for (int j=0;j<N;j++){ + A_stl[j].resize(N); + for (int i=0;i<N;i++){ + A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ + cible = source; + } +}; + +#endif diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt new file mode 100644 index 000000000..40a71c781 --- /dev/null +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -0,0 +1,45 @@ +#3.0.1 +#3.1.1 +#3.2.0 +3.2.4 +#5745:37f59e65eb6c +5891:d8652709345d # introduce AVX +#5893:24b4dc92c6d3 # merge +5895:997c2ef9fc8b # introduce FMA +#5904:e1eafd14eaa1 # complex and AVX +5908:f8ee3c721251 # improve packing with ptranspose +#5921:ca808bb456b0 # merge +#5927:8b1001f9e3ac +5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +#5949:f3488f4e45b2 # merge +#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec +#5992:4a429f5e0483 # merge +before-evaluators +#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products +#6639:c9121c60b5c7 +#6655:06f163b5221f # Properly detect FMA support on ARM +#6677:700e023044e7 # FMA has been wrongly disabled +#6681:11d31dafb0e3 +#6699:5e6e8e10aad1 # merge default to tensors +#6726:ff2d2388e7b9 # merge default to tensors +#6742:0cbd6195e829 # merge default to tensors +#6747:853d2bafeb8f # Generalized the gebp apis +6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation +#6781:9cc5a931b2c6 # generalized gemv +#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +#6844:039efd86b75c # merge tensor +6845:7333ed40c6ef # change prefetching in gebp +#6856:b5be5e10eb7f # merge index conversion +#6893:c3a64aba7c70 # clean blocking size computation +#6898:6fb31ebe6492 # rotating kernel for ARM +6899:877facace746 # rotating kernel for ARM only +#6904:c250623ae9fa # result_of +6921:915f1b1fc158 # fix prefetching change for ARM +6923:9ff25f6dacc6 # prefetching +6933:52572e60b5d3 # blocking size strategy +6937:c8c042f286b2 # avoid redundant pack_rhs +6981:7e5d6f78da59 # dynamic loop swapping +6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. +7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) + diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp new file mode 100644 index 000000000..72eb9cab6 --- /dev/null +++ b/bench/perf_monitoring/gemm/gemm.cpp @@ -0,0 +1,67 @@ +#include <iostream> +#include <fstream> +#include <vector> +#include <Eigen/Core> +#include "../../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +typedef Matrix<Scalar,Dynamic,Dynamic> Mat; + +EIGEN_DONT_INLINE +void gemm(const Mat &A, const Mat &B, Mat &C) +{ + C.noalias() += A * B; +} + +EIGEN_DONT_INLINE +double bench(long m, long n, long k) +{ + Mat A(m,k); + Mat B(k,n); + Mat C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e8*4/sizeof(Scalar); + double tm0 = 4, tm1 = 10; + if(NumTraits<Scalar>::IsComplex) + { + up /= 4; + tm0 = 2; + tm1 = 4; + } + + double flops = 2. * m * n * k; + long rep = std::max(1., std::min(100., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, gemm(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +int main(int argc, char **argv) +{ + std::vector<double> results; + + std::ifstream settings("settings.txt"); + long m, n, k; + while(settings >> m >> n >> k) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench(m, n, k) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh new file mode 100755 index 000000000..609c471f9 --- /dev/null +++ b/bench/perf_monitoring/gemm/make_plot.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# base name of the bench +# it reads $1.out +# and generates $1.pdf +WHAT=$1 + +header="rev " +while read line +do + if [ ! -z '$line' ]; then + header="$header \"$line\"" + fi +done < settings.txt + +echo $header > $WHAT.out.header +cat $WHAT.out >> $WHAT.out.header + + +echo "set title '$WHAT'" > $WHAT.gnuplot +echo "set key autotitle columnhead outside " >> $WHAT.gnuplot +echo "set xtics rotate 1" >> $WHAT.gnuplot + +echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot +echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot + +col=`cat settings.txt | wc -l` +echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot +echo " " >> $WHAT.gnuplot + +gnuplot -persist < $WHAT.gnuplot + +# generate a png file +# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten .$WHAT.png + +# clean +rm $WHAT.out.header $WHAT.gnuplot
\ No newline at end of file diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh new file mode 100755 index 000000000..3fa6a3661 --- /dev/null +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -0,0 +1,152 @@ +#!/bin/bash + +# Examples of environment variables to be set: +# PREFIX="haswell-fma-" +# CXX_FLAGS="-mfma" + +# Options: +# -up : enforce the recomputation of existing data, and keep best results as a merging strategy +# -s : recompute selected changesets only and keep bests + + +if echo "$*" | grep '\-up' > /dev/null; then + update=true +else + update=false +fi + +if echo "$*" | grep '\-s' > /dev/null; then + selected=true +else + selected=false +fi + +global_args="$*" + +if [ $selected == true ]; then + echo "Recompute selected changesets only and keep bests" +elif [ $update == true ]; then + echo "(Re-)Compute all changesets and keep bests" +else + echo "Skip previously computed changesets" +fi + + + +if [ ! -d "eigen_src" ]; then + hg clone https://bitbucket.org/eigen/eigen eigen_src +else + cd eigen_src + hg pull -u + cd .. +fi + +if [ ! -z '$CXX' ]; then + CXX=g++ +fi + +function make_backup +{ + if [ -f "$1.out" ]; then + mv "$1.out" "$1.backup" + fi +} + +function merge +{ + count1=`echo $1 | wc -w` + count2=`echo $2 | wc -w` + + if [ $count1 == $count2 ]; then + a=( $1 ); b=( $2 ) + res="" + for (( i=0 ; i<$count1 ; i++ )); do + ai=${a[$i]}; bi=${b[$i]} + tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l` + res="$res $tmp" + done + echo $res + + else + echo $1 + fi +} + +function test_current +{ + rev=$1 + scalar=$2 + name=$3 + + prev="" + if [ -e "$name.backup" ]; then + prev=`grep $rev "$name.backup" | cut -c 14-` + fi + res=$prev + count_rev=`echo $prev | wc -w` + count_ref=`cat "settings.txt" | wc -l` + if echo "$global_args" | grep "$rev" > /dev/null; then + rev_found=true + else + rev_found=false + fi +# echo $update et $selected et $rev_found because $rev et "$global_args" +# echo $count_rev et $count_ref + if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then + if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then + curr=`./$name` + if [ $count_rev == $count_ref ]; then + echo "merge previous $prev" + echo "with new $curr" + else + echo "got $curr" + fi + res=`merge "$curr" "$prev"` +# echo $res + echo "$rev $res" >> $name.out + else + echo "Compilation failed, skip rev $rev" + fi + else + echo "Skip existing results for $rev / $name" + echo "$rev $res" >> $name.out + fi +} + +make_backup $PREFIX"sgemm" +make_backup $PREFIX"dgemm" +make_backup $PREFIX"cgemm" + +cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev +do + if [ ! -z '$rev' ]; then + echo "Testing rev $rev" + cd eigen_src + hg up -C $rev > /dev/null + actual_rev=`hg identify | cut -f1 -d' '` + cd .. + + test_current $actual_rev float $PREFIX"sgemm" + test_current $actual_rev double $PREFIX"dgemm" + test_current $actual_rev "std::complex<double>" $PREFIX"cgemm" + fi + +done + +echo "Float:" +cat $PREFIX"sgemm.out" +echo "" + +echo "Double:" +cat $PREFIX"dgemm.out" +echo "" + +echo "Complex:" +cat $PREFIX"cgemm.out" +echo "" + +./make_plot.sh $PREFIX"sgemm" +./make_plot.sh $PREFIX"dgemm" +./make_plot.sh $PREFIX"cgemm" + + diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt new file mode 100644 index 000000000..5c43e1c7d --- /dev/null +++ b/bench/perf_monitoring/gemm/settings.txt @@ -0,0 +1,15 @@ +8 8 8 +9 9 9 +24 24 24 +239 239 239 +240 240 240 +2400 24 24 +24 2400 24 +24 24 2400 +24 2400 2400 +2400 24 2400 +2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 +2400 2400 2400 diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h new file mode 100644 index 000000000..525b9acda --- /dev/null +++ b/bench/tensors/tensor_benchmarks.h @@ -0,0 +1,305 @@ +#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ +#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ + +typedef int TensorIndex; +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "testing/base/public/benchmark.h" + +using Eigen::Tensor; +using Eigen::TensorMap; + + +// TODO(bsteiner): also templatize on the input type since we have users +// for int8 as well as floats. +template <typename Device> class BenchmarkSuite { + public: + BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) + : m_(m), k_(k), n_(n), device_(device) { + initialize(); + } + + BenchmarkSuite(const Device& device, size_t m) + : m_(m), k_(m), n_(m), device_(device) { + initialize(); + } + + ~BenchmarkSuite() { + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + } + + void memcpy(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void random(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = C.random(); + } + // Record the number of random numbers generated per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void slicing(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0)); + const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2)); + const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0)); + const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } + // Record the number of values copied from the rhs slice to the lhs slice + // each second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void shuffling(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const Eigen::array<int, 2> shuffle(1, 0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } + // Record the number of values shuffled from A and copied to B each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void padding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_-3); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; + paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); + paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.pad(paddings); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void striding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const Eigen::array<TensorIndex, 2> strides(1, 2); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.stride(strides); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void broadcasting(int num_iters) { + const Eigen::array<TensorIndex, 2> size_a(m_, 1); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_c(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); + +#if defined(__CUDACC__) + // nvcc doesn't support cxx11 + const Eigen::array<int, 2> broadcast(1, n_); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1>, int> broadcast; + broadcast.set(1, n_); +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } + // Record the number of values broadcasted from A and copied to C each second + finalizeBenchmark(m_ * n_ * num_iters); + } + + void coeffWiseOp(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + } + // Record the number of FLOP executed per second (2 multiplications and + // 1 addition per value) + finalizeBenchmark(3 * m_ * m_ * num_iters); + } + + void algebraicFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + void transcendentalFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.exp() + B.log(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // Simple reduction + void reduction(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size(k_, n_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); + const Eigen::array<TensorIndex, 1> output_size(n_); + TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); + + const Eigen::array<TensorIndex, 1> sum_along_dim(0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // do a contraction which is equivalent to a matrix multiplication + void contraction(int num_iters) { + const Eigen::array<TensorIndex, 2> sizeA(m_, k_); + const Eigen::array<TensorIndex, 2> sizeB(k_, n_); + const Eigen::array<TensorIndex, 2> sizeC(m_, n_); + + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<float, 2>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + } + + void convolution(int num_iters, int kernel_x, int kernel_y) { + const Eigen::array<TensorIndex, 2> input_sizes(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); + const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); + const Eigen::array<TensorIndex, 2> result_sizes( + m_ - kernel_x + 1, n_ - kernel_y + 1); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); + Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.convolve(B, dims); + } + // Record the number of FLOP executed per second (kernel_size + // multiplications and additions for each value in the resulting tensor) + finalizeBenchmark( + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + } + + private: + void initialize() { + a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); + b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); + c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(float)); + device_.memset(b_, 23, k_ * n_ * sizeof(float)); + device_.memset(c_, 31, m_ * n_ * sizeof(float)); + + BenchmarkUseRealTime(); + } + + inline void finalizeBenchmark(int64 num_items) { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) { + device_.synchronize(); + } +#endif + StopBenchmarkTiming(); + SetBenchmarkItemsProcessed(num_items); + } + + + size_t m_; + size_t k_; + size_t n_; + float* a_; + float* b_; + float* c_; + Device device_; +}; +#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc new file mode 100644 index 000000000..68653ba15 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -0,0 +1,156 @@ +#define EIGEN_USE_THREADS + +#include "base/sysinfo.h" +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" +#include "thread/threadpool.h" + +#ifdef __ANDROID__ +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPoolDevice device(threads); +#else +#define CREATE_THREAD_POOL(threads) \ +ThreadPool tp(threads); \ +tp.StartWorkers(); \ +Eigen::ThreadPoolDevice device(&tp, threads); +#endif + +// Simple functions +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); + +BM_FuncCPU(memcpy, 4); +BM_FuncCPU(memcpy, 8); +BM_FuncCPU(memcpy, 12); + +BM_FuncCPU(random, 4); +BM_FuncCPU(random, 8); +BM_FuncCPU(random, 12); + +BM_FuncCPU(slicing, 4); +BM_FuncCPU(slicing, 8); +BM_FuncCPU(slicing, 12); + +BM_FuncCPU(shuffling, 4); +BM_FuncCPU(shuffling, 8); +BM_FuncCPU(shuffling, 12); + +BM_FuncCPU(padding, 4); +BM_FuncCPU(padding, 8); +BM_FuncCPU(padding, 12); + +BM_FuncCPU(striding, 4); +BM_FuncCPU(striding, 8); +BM_FuncCPU(striding, 12); + +BM_FuncCPU(broadcasting, 4); +BM_FuncCPU(broadcasting, 8); +BM_FuncCPU(broadcasting, 12); + +BM_FuncCPU(coeffWiseOp, 4); +BM_FuncCPU(coeffWiseOp, 8); +BM_FuncCPU(coeffWiseOp, 12); + +BM_FuncCPU(algebraicFunc, 4); +BM_FuncCPU(algebraicFunc, 8); +BM_FuncCPU(algebraicFunc, 12); + +BM_FuncCPU(transcendentalFunc, 4); +BM_FuncCPU(transcendentalFunc, 8); +BM_FuncCPU(transcendentalFunc, 12); + +BM_FuncCPU(reduction, 4); +BM_FuncCPU(reduction, 8); +BM_FuncCPU(reduction, 12); + + +// Contractions +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); + + +BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); + + +// Convolutions +#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ + static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); + +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); + +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); + +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); + +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc new file mode 100644 index 000000000..adea754ad --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -0,0 +1,75 @@ +#define EIGEN_USE_GPU + +#include <cuda.h> +#include <cuda_runtime.h> +#include <iostream> +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" + + + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(reduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); |