diff options
Diffstat (limited to 'bench')
-rw-r--r-- | bench/btl/generic_bench/bench_parameter.hh | 2 | ||||
-rw-r--r-- | bench/btl/generic_bench/btl.hh | 13 | ||||
-rw-r--r-- | bench/dense_solvers.cpp | 30 | ||||
-rw-r--r-- | bench/tensors/README | 12 | ||||
-rw-r--r-- | bench/tensors/benchmark.h | 49 | ||||
-rw-r--r-- | bench/tensors/benchmark_main.cc | 237 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 378 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_cpu.cc | 90 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_fp16_gpu.cu | 76 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_gpu.cu (renamed from bench/tensors/tensor_benchmarks_gpu.cc) | 32 |
10 files changed, 735 insertions, 184 deletions
diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 0f62bd421..2b01149f9 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -29,7 +29,7 @@ // min vector size for axpy bench #define MIN_AXPY 5 // max vector size for axpy bench -#define MAX_AXPY 1000000 +#define MAX_AXPY 3000000 // min matrix size for matrix vector product bench #define MIN_MV 5 // max matrix size for matrix vector product bench diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index 92af1306a..706b00fb0 100644 --- a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -44,15 +44,10 @@ #define BTL_ASM_COMMENT(X) #endif -#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && !defined(__arm__) && !defined(__powerpc__) -#define BTL_DISABLE_SSE_EXCEPTIONS() { \ - int aux = 0; \ - asm( \ - "stmxcsr %[aux] \n\t" \ - "orl $32832, %[aux] \n\t" \ - "ldmxcsr %[aux] \n\t" \ - : : [aux] "m" (aux)); \ -} +#ifdef __SSE__ +#include "xmmintrin.h" +// This enables flush to zero (FTZ) and denormals are zero (DAZ) modes: +#define BTL_DISABLE_SSE_EXCEPTIONS() { _mm_setcsr(_mm_getcsr() | 0x8040); } #else #define BTL_DISABLE_SSE_EXCEPTIONS() #endif diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp index f37a8bb5f..aa4ff011f 100644 --- a/bench/dense_solvers.cpp +++ b/bench/dense_solvers.cpp @@ -14,12 +14,12 @@ void bench(int id, int size = Size) Mat A(size,size); A.setRandom(); A = A*A.adjoint(); - BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd; + BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd; int tries = 3; int rep = 1000/size; if(rep==0) rep = 1; - rep = rep*rep; +// rep = rep*rep; LLT<Mat> llt(A); LDLT<Mat> ldlt(A); @@ -27,8 +27,10 @@ void bench(int id, int size = Size) FullPivLU<Mat> fplu(A); HouseholderQR<Mat> qr(A); ColPivHouseholderQR<Mat> cpqr(A); + CompleteOrthogonalDecomposition<Mat> cod(A); FullPivHouseholderQR<Mat> fpqr(A); JacobiSVD<Mat> jsvd(A.rows(),A.cols()); + BDCSVD<Mat> bdcsvd(A.rows(),A.cols()); BENCH(t_llt, tries, rep, llt.compute(A)); BENCH(t_ldlt, tries, rep, ldlt.compute(A)); @@ -36,9 +38,11 @@ void bench(int id, int size = Size) BENCH(t_fplu, tries, rep, fplu.compute(A)); BENCH(t_qr, tries, rep, qr.compute(A)); BENCH(t_cpqr, tries, rep, cpqr.compute(A)); + BENCH(t_cod, tries, rep, cod.compute(A)); BENCH(t_fpqr, tries, rep, fpqr.compute(A)); if(size<500) // JacobiSVD is really too slow for too large matrices BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV)); + BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV)); results["LLT"][id] = t_llt.best(); results["LDLT"][id] = t_ldlt.best(); @@ -46,8 +50,10 @@ void bench(int id, int size = Size) results["FullPivLU"][id] = t_fplu.best(); results["HouseholderQR"][id] = t_qr.best(); results["ColPivHouseholderQR"][id] = t_cpqr.best(); + results["CompleteOrthogonalDecomposition"][id] = t_cod.best(); results["FullPivHouseholderQR"][id] = t_fpqr.best(); results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0; + results["BDCSVD"][id] = t_bdcsvd.best(); } int main() @@ -64,13 +70,15 @@ int main() IOFormat fmt(3, 0, " \t", "\n", "", ""); - std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; - std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; - std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; + std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; + std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n"; } diff --git a/bench/tensors/README b/bench/tensors/README new file mode 100644 index 000000000..4398aa81b --- /dev/null +++ b/bench/tensors/README @@ -0,0 +1,12 @@ +Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. + +To compile the floating point CPU benchmarks, simply call: +g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu + +To compile the floating point GPU benchmarks, simply call: +nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu + + +To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. +nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu + diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h new file mode 100644 index 000000000..f115b54ad --- /dev/null +++ b/bench/tensors/benchmark.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <stddef.h> +#include <stdint.h> +#include <vector> + +namespace testing { +class Benchmark { + public: + Benchmark(const char* name, void (*fn)(int)) { + Register(name, fn, NULL); + } + Benchmark(const char* name, void (*fn_range)(int, int)) { + Register(name, NULL, fn_range); + } + Benchmark* Arg(int x); + Benchmark* Range(int lo, int hi); + const char* Name(); + bool ShouldRun(int argc, char* argv[]); + void Run(); + private: + const char* name_; + void (*fn_)(int); + void (*fn_range_)(int, int); + std::vector<int> args_; + void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)); + void RunRepeatedlyWithArg(int iterations, int arg); + void RunWithArg(int arg); +}; +} // namespace testing +void SetBenchmarkFlopsProcessed(int64_t); +void StopBenchmarkTiming(); +void StartBenchmarkTiming(); +#define BENCHMARK(f) \ + static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ + (new ::testing::Benchmark(#f, f)) diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc new file mode 100644 index 000000000..1efa0dbad --- /dev/null +++ b/bench/tensors/benchmark_main.cc @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.h" +#include <regex.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <string> +#include <inttypes.h> +#include <time.h> +#include <map> + +static int64_t g_flops_processed; +static int64_t g_benchmark_total_time_ns; +static int64_t g_benchmark_start_time_ns; +typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap; +typedef BenchmarkMap::iterator BenchmarkMapIt; + +BenchmarkMap& gBenchmarks() { + static BenchmarkMap g_benchmarks; + return g_benchmarks; +} + +static int g_name_column_width = 20; + +static int Round(int n) { + int base = 1; + while (base*10 < n) { + base *= 10; + } + if (n < 2*base) { + return 2*base; + } + if (n < 5*base) { + return 5*base; + } + return 10*base; +} + +#ifdef __APPLE__ + #include <mach/mach_time.h> + static mach_timebase_info_data_t g_time_info; + static void __attribute__((constructor)) init_info() { + mach_timebase_info(&g_time_info); + } +#endif + +static int64_t NanoTime() { +#if defined(__APPLE__) + uint64_t t = mach_absolute_time(); + return t * g_time_info.numer / g_time_info.denom; +#else + struct timespec t; + t.tv_sec = t.tv_nsec = 0; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec; +#endif +} + +namespace testing { +Benchmark* Benchmark::Arg(int arg) { + args_.push_back(arg); + return this; +} + +Benchmark* Benchmark::Range(int lo, int hi) { + const int kRangeMultiplier = 8; + if (hi < lo) { + int temp = hi; + hi = lo; + lo = temp; + } + while (lo < hi) { + args_.push_back(lo); + lo *= kRangeMultiplier; + } + // We always run the hi number. + args_.push_back(hi); + return this; +} + +const char* Benchmark::Name() { + return name_; +} +bool Benchmark::ShouldRun(int argc, char* argv[]) { + if (argc == 1) { + return true; // With no arguments, we run all benchmarks. + } + // Otherwise, we interpret each argument as a regular expression and + // see if any of our benchmarks match. + for (int i = 1; i < argc; i++) { + regex_t re; + if (regcomp(&re, argv[i], 0) != 0) { + fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]); + exit(EXIT_FAILURE); + } + int match = regexec(&re, name_, 0, NULL, 0); + regfree(&re); + if (match != REG_NOMATCH) { + return true; + } + } + return false; +} +void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) { + name_ = name; + fn_ = fn; + fn_range_ = fn_range; + if (fn_ == NULL && fn_range_ == NULL) { + fprintf(stderr, "%s: missing function\n", name_); + exit(EXIT_FAILURE); + } + gBenchmarks().insert(std::make_pair(name, this)); +} +void Benchmark::Run() { + if (fn_ != NULL) { + RunWithArg(0); + } else { + if (args_.empty()) { + fprintf(stderr, "%s: no args!\n", name_); + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < args_.size(); ++i) { + RunWithArg(args_[i]); + } + } +} +void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { + g_flops_processed = 0; + g_benchmark_total_time_ns = 0; + g_benchmark_start_time_ns = NanoTime(); + if (fn_ != NULL) { + fn_(iterations); + } else { + fn_range_(iterations, arg); + } + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } +} +void Benchmark::RunWithArg(int arg) { + // run once in case it's expensive + int iterations = 1; + RunRepeatedlyWithArg(iterations, arg); + while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) { + int last = iterations; + if (g_benchmark_total_time_ns/iterations == 0) { + iterations = 1e9; + } else { + iterations = 1e9 / (g_benchmark_total_time_ns/iterations); + } + iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last)); + iterations = Round(iterations); + RunRepeatedlyWithArg(iterations, arg); + } + char throughput[100]; + throughput[0] = '\0'; + if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) { + double mflops_processed = static_cast<double>(g_flops_processed)/1e6; + double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9; + snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds); + } + char full_name[100]; + if (fn_range_ != NULL) { + if (arg >= (1<<20)) { + snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20)); + } else if (arg >= (1<<10)) { + snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10)); + } else { + snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg); + } + } else { + snprintf(full_name, sizeof(full_name), "%s", name_); + } + printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name, + iterations, g_benchmark_total_time_ns/iterations, throughput); + fflush(stdout); +} +} // namespace testing +void SetBenchmarkFlopsProcessed(int64_t x) { + g_flops_processed = x; +} +void StopBenchmarkTiming() { + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } + g_benchmark_start_time_ns = 0; +} +void StartBenchmarkTiming() { + if (g_benchmark_start_time_ns == 0) { + g_benchmark_start_time_ns = NanoTime(); + } +} +int main(int argc, char* argv[]) { + if (gBenchmarks().empty()) { + fprintf(stderr, "No benchmarks registered!\n"); + exit(EXIT_FAILURE); + } + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { + int name_width = static_cast<int>(strlen(it->second->Name())); + g_name_column_width = std::max(g_name_column_width, name_width); + } + bool need_header = true; + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { + ::testing::Benchmark* b = it->second; + if (b->ShouldRun(argc, argv)) { + if (need_header) { + printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op"); + fflush(stdout); + need_header = false; + } + b->Run(); + } + } + if (need_header) { + fprintf(stderr, "No matching benchmarks!\n"); + fprintf(stderr, "Available benchmarks:\n"); + for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) { + fprintf(stderr, " %s\n", it->second->Name()); + } + exit(EXIT_FAILURE); + } + return 0; +} diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 525b9acda..90b9bc741 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -4,16 +4,18 @@ typedef int TensorIndex; #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "testing/base/public/benchmark.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "benchmark.h" + +#define BENCHMARK_RANGE(bench, lo, hi) \ + BENCHMARK(bench)->Range(lo, hi) using Eigen::Tensor; using Eigen::TensorMap; - // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. -template <typename Device> class BenchmarkSuite { +template <typename Device, typename T> class BenchmarkSuite { public: BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) : m_(m), k_(k), n_(n), device_(device) { @@ -35,37 +37,62 @@ template <typename Device> class BenchmarkSuite { eigen_assert(m_ == k_ && k_ == n_); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); } // Record the number of values copied per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); + } + + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + Eigen::array<TensorIndex, 2> sizes; + if (sizeof(T) >= sizeof(int)) { + sizes[0] = m_; + sizes[1] = k_; + } else { + sizes[0] = m_ * sizeof(T) / sizeof(int); + sizes[1] = k_ * sizeof(T) / sizeof(int); + } + const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes); + TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.template cast<T>(); + } + // Record the number of values copied per second + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes(m_, m_); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); } // Record the number of random numbers generated per second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes(m_, m_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); - - const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); - const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0)); - const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2)); - const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0)); - const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2); + const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0); + const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2); + const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0); + const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -80,32 +107,76 @@ template <typename Device> class BenchmarkSuite { } // Record the number of values copied from the rhs slice to the lhs slice // each second - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); + } + + void rowChip(int num_iters) { + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = n_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); + } + + void colChip(int num_iters) { + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = n_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(static_cast<int64_t>(n_) * num_iters); } void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array<TensorIndex, 2> size_a(m_, k_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b(k_, m_); - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); - - const Eigen::array<int, 2> shuffle(1, 0); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = k_; + size_b[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<int, 2> shuffle; + shuffle[0] = 1; + shuffle[1] = 0; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.shuffle(shuffle); } // Record the number of values shuffled from A and copied to B each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array<TensorIndex, 2> size_a(m_, k_-3); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b(k_, m_); - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_-3; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = k_; + size_b[1] = m_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); @@ -116,35 +187,46 @@ template <typename Device> class BenchmarkSuite { B.device(device_) = A.pad(paddings); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array<TensorIndex, 2> size_a(m_, k_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2); - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); - - const Eigen::array<TensorIndex, 2> strides(1, 2); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = k_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_b; + size_b[0] = m_; + size_b[1] = k_/2; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<TensorIndex, 2> strides; + strides[0] = 1; + strides[1] = 2; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.stride(strides); } // Record the number of values copied from the padded tensor A each second - finalizeBenchmark(m_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } void broadcasting(int num_iters) { - const Eigen::array<TensorIndex, 2> size_a(m_, 1); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); - const Eigen::array<TensorIndex, 2> size_c(m_, n_); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); - -#if defined(__CUDACC__) - // nvcc doesn't support cxx11 - const Eigen::array<int, 2> broadcast(1, n_); + Eigen::array<TensorIndex, 2> size_a; + size_a[0] = m_; + size_a[1] = 1; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a); + Eigen::array<TensorIndex, 2> size_c; + size_c[0] = m_; + size_c[1] = n_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c); + +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::array<int, 2> broadcast; + broadcast[0] = 1; + broadcast[1] = n_; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -157,31 +239,35 @@ template <typename Device> class BenchmarkSuite { C.device(device_) = A.broadcast(broadcast); } // Record the number of values broadcasted from A and copied to C each second - finalizeBenchmark(m_ * n_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters); } void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes(m_, m_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7)); } // Record the number of FLOP executed per second (2 multiplications and // 1 addition per value) - finalizeBenchmark(3 * m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters); } void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes(m_, m_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -189,15 +275,17 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array<TensorIndex, 2> sizes(m_, m_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + Eigen::array<TensorIndex, 2> sizes; + sizes[0] = m_; + sizes[1] = m_; + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -205,17 +293,57 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { - const Eigen::array<TensorIndex, 2> input_size(k_, n_); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); - const Eigen::array<TensorIndex, 1> output_size(n_); - TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); + // Row reduction + void rowReduction(int num_iters) { + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = n_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::array<TensorIndex, 1> sum_along_dim; + sum_along_dim[0] = 0; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<0>> sum_along_dim; +#endif - const Eigen::array<TensorIndex, 1> sum_along_dim(0); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B( + b_, input_size); + Eigen::array<TensorIndex, 1> output_size; + output_size[0] = k_; + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::array<TensorIndex, 1> sum_along_dim; + sum_along_dim[0] = 1; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1>> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -223,21 +351,48 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } - // do a contraction which is equivalent to a matrix multiplication - void contraction(int num_iters) { - const Eigen::array<TensorIndex, 2> sizeA(m_, k_); - const Eigen::array<TensorIndex, 2> sizeB(k_, n_); - const Eigen::array<TensorIndex, 2> sizeC(m_, n_); + // Full reduction + void fullReduction(int num_iters) { + Eigen::array<TensorIndex, 2> input_size; + input_size[0] = k_; + input_size[1] = n_; + const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B( + b_, input_size); + Eigen::array<TensorIndex, 0> output_size; + TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C( + c_, output_size); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); + } - typedef typename Tensor<float, 2>::DimensionPair DimPair; - const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + // do a contraction which is equivalent to a matrix multiplication + void contraction(int num_iters) { + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -245,18 +400,25 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (size_ multiplications and // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters); } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array<TensorIndex, 2> input_sizes(m_, n_); - TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); - const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y); - TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array<TensorIndex, 2> result_sizes( - m_ - kernel_x + 1, n_ - kernel_y + 1); - TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); - Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1); + Eigen::array<TensorIndex, 2> input_sizes; + input_sizes[0] = m_; + input_sizes[1] = n_; + TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes); + Eigen::array<TensorIndex, 2> kernel_sizes; + kernel_sizes[0] = kernel_x; + kernel_sizes[1] = kernel_y; + TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes); + Eigen::array<TensorIndex, 2> result_sizes; + result_sizes[0] = m_ - kernel_x + 1; + result_sizes[1] = n_ - kernel_y + 1; + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes); + Eigen::array<TensorIndex, 2> dims; + dims[0] = 0; + dims[1] = 1; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -264,42 +426,42 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) - finalizeBenchmark( - (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + finalizeBenchmark(static_cast<int64_t>(2) * + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: void initialize() { - a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); - b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); - c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. - device_.memset(a_, 12, m_ * k_ * sizeof(float)); - device_.memset(b_, 23, k_ * n_ * sizeof(float)); - device_.memset(c_, 31, m_ * n_ * sizeof(float)); + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); - BenchmarkUseRealTime(); + //BenchmarkUseRealTime(); } - inline void finalizeBenchmark(int64 num_items) { + inline void finalizeBenchmark(int64_t num_items) { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) { device_.synchronize(); } #endif StopBenchmarkTiming(); - SetBenchmarkItemsProcessed(num_items); + SetBenchmarkFlopsProcessed(num_items); } - size_t m_; - size_t k_; - size_t n_; - float* a_; - float* b_; - float* c_; + TensorIndex m_; + TensorIndex k_; + TensorIndex n_; + T* a_; + T* b_; + T* c_; Device device_; }; #endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 68653ba15..8947f4b7f 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -1,35 +1,31 @@ #define EIGEN_USE_THREADS -#include "base/sysinfo.h" -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" -#include "thread/threadpool.h" +#include <string> + +#include "tensor_benchmarks.h" -#ifdef __ANDROID__ -#define CREATE_THREAD_POOL(threads) \ -Eigen::ThreadPoolDevice device(threads); -#else #define CREATE_THREAD_POOL(threads) \ -ThreadPool tp(threads); \ -tp.StartWorkers(); \ -Eigen::ThreadPoolDevice device(&tp, threads); -#endif +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions -#define BM_FuncCPU(FUNC, THREADS) \ - static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ - StopBenchmarkTiming(); \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ - suite.FUNC(iters); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ - } \ +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); BM_FuncCPU(memcpy, 4); BM_FuncCPU(memcpy, 8); BM_FuncCPU(memcpy, 12); +BM_FuncCPU(typeCasting, 4); +BM_FuncCPU(typeCasting, 8); +BM_FuncCPU(typeCasting, 12); + BM_FuncCPU(random, 4); BM_FuncCPU(random, 8); BM_FuncCPU(random, 12); @@ -38,6 +34,14 @@ BM_FuncCPU(slicing, 4); BM_FuncCPU(slicing, 8); BM_FuncCPU(slicing, 12); +BM_FuncCPU(rowChip, 4); +BM_FuncCPU(rowChip, 8); +BM_FuncCPU(rowChip, 12); + +BM_FuncCPU(colChip, 4); +BM_FuncCPU(colChip, 8); +BM_FuncCPU(colChip, 12); + BM_FuncCPU(shuffling, 4); BM_FuncCPU(shuffling, 8); BM_FuncCPU(shuffling, 12); @@ -66,26 +70,29 @@ BM_FuncCPU(transcendentalFunc, 4); BM_FuncCPU(transcendentalFunc, 8); BM_FuncCPU(transcendentalFunc, 12); -BM_FuncCPU(reduction, 4); -BM_FuncCPU(reduction, 8); -BM_FuncCPU(reduction, 12); +BM_FuncCPU(rowReduction, 4); +BM_FuncCPU(rowReduction, 8); +BM_FuncCPU(rowReduction, 12); + +BM_FuncCPU(colReduction, 4); +BM_FuncCPU(colReduction, 8); +BM_FuncCPU(colReduction, 12); // Contractions -#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ - static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ - StopBenchmarkTiming(); \ - if (THREADS == 1) { \ - Eigen::DefaultDevice device; \ - BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } else { \ - CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ - } \ +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -107,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); + BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); @@ -125,9 +138,8 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ StopBenchmarkTiming(); \ CREATE_THREAD_POOL(THREADS); \ - BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu new file mode 100644 index 000000000..d34bd73ca --- /dev/null +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -0,0 +1,76 @@ +#define EIGEN_USE_GPU + +#include <cuda.h> +#include <cuda_runtime.h> +#include <iostream> + +#include "tensor_benchmarks.h" + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); +//BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +//BM_FuncGPU(algebraicFunc); +//BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +/*BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); +*/ + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::CudaStreamDevice stream; \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +/* +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); +*/ diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu index adea754ad..76d68c5c1 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -3,47 +3,48 @@ #include <cuda.h> #include <cuda_runtime.h> #include <iostream> -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" - +#include "tensor_benchmarks.h" // Simple functions #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); BM_FuncGPU(shuffling); BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); // Contractions #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ suite.FUNC(iters); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); @@ -51,19 +52,18 @@ BM_FuncGPU(reduction); BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); // Convolutions #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - cudaStream_t stream; \ - cudaStreamCreate(&stream); \ + Eigen::CudaStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ - BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \ cudaDeviceSynchronize(); \ suite.FUNC(iters, DIM1, DIM2); \ - cudaStreamDestroy(stream); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); |