Diffstat (limited to 'bench')
-rw-r--r--  bench/btl/generic_bench/bench_parameter.hh   |   2
-rw-r--r--  bench/btl/generic_bench/btl.hh               |  13
-rw-r--r--  bench/dense_solvers.cpp                      |  30
-rw-r--r--  bench/tensors/README                         |  12
-rw-r--r--  bench/tensors/benchmark.h                    |  49
-rw-r--r--  bench/tensors/benchmark_main.cc              | 237
-rw-r--r--  bench/tensors/tensor_benchmarks.h            | 378
-rw-r--r--  bench/tensors/tensor_benchmarks_cpu.cc       |  90
-rw-r--r--  bench/tensors/tensor_benchmarks_fp16_gpu.cu  |  76
-rw-r--r--  bench/tensors/tensor_benchmarks_gpu.cu (renamed from bench/tensors/tensor_benchmarks_gpu.cc)  |  32
10 files changed, 735 insertions, 184 deletions
diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh
index 0f62bd421..2b01149f9 100644
--- a/bench/btl/generic_bench/bench_parameter.hh
+++ b/bench/btl/generic_bench/bench_parameter.hh
@@ -29,7 +29,7 @@
// min vector size for axpy bench
#define MIN_AXPY 5
// max vector size for axpy bench
-#define MAX_AXPY 1000000
+#define MAX_AXPY 3000000
// min matrix size for matrix vector product bench
#define MIN_MV 5
// max matrix size for matrix vector product bench
diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh
index 92af1306a..706b00fb0 100644
--- a/bench/btl/generic_bench/btl.hh
+++ b/bench/btl/generic_bench/btl.hh
@@ -44,15 +44,10 @@
#define BTL_ASM_COMMENT(X)
#endif
-#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && !defined(__arm__) && !defined(__powerpc__)
-#define BTL_DISABLE_SSE_EXCEPTIONS() { \
- int aux = 0; \
- asm( \
- "stmxcsr %[aux] \n\t" \
- "orl $32832, %[aux] \n\t" \
- "ldmxcsr %[aux] \n\t" \
- : : [aux] "m" (aux)); \
-}
+#ifdef __SSE__
+#include "xmmintrin.h"
+// This enables flush to zero (FTZ) and denormals are zero (DAZ) modes:
+#define BTL_DISABLE_SSE_EXCEPTIONS() { _mm_setcsr(_mm_getcsr() | 0x8040); }
#else
#define BTL_DISABLE_SSE_EXCEPTIONS()
#endif
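
Note: the decimal constant 32832 in the old inline assembly is the same value as the hexadecimal 0x8040 used here, so the old and new versions set the same two MXCSR bits: FTZ (flush-to-zero, bit 15, 0x8000) and DAZ (denormals-are-zero, bit 6, 0x0040). A minimal standalone sketch of what the new macro does, assuming an SSE-capable x86 target:

#ifdef __SSE__
#include <xmmintrin.h>

// Treat denormal results (FTZ) and denormal inputs (DAZ) as zero, so the
// benchmarks never hit the slow microcoded denormal path.
static void disable_denormals() {
  _mm_setcsr(_mm_getcsr() | 0x8040);  // 0x8000 (FTZ) | 0x0040 (DAZ)
}
#endif
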
diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp
index f37a8bb5f..aa4ff011f 100644
--- a/bench/dense_solvers.cpp
+++ b/bench/dense_solvers.cpp
@@ -14,12 +14,12 @@ void bench(int id, int size = Size)
Mat A(size,size);
A.setRandom();
A = A*A.adjoint();
- BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd;
+ BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd;
int tries = 3;
int rep = 1000/size;
if(rep==0) rep = 1;
- rep = rep*rep;
+// rep = rep*rep;
LLT<Mat> llt(A);
LDLT<Mat> ldlt(A);
@@ -27,8 +27,10 @@ void bench(int id, int size = Size)
FullPivLU<Mat> fplu(A);
HouseholderQR<Mat> qr(A);
ColPivHouseholderQR<Mat> cpqr(A);
+ CompleteOrthogonalDecomposition<Mat> cod(A);
FullPivHouseholderQR<Mat> fpqr(A);
JacobiSVD<Mat> jsvd(A.rows(),A.cols());
+ BDCSVD<Mat> bdcsvd(A.rows(),A.cols());
BENCH(t_llt, tries, rep, llt.compute(A));
BENCH(t_ldlt, tries, rep, ldlt.compute(A));
@@ -36,9 +38,11 @@ void bench(int id, int size = Size)
BENCH(t_fplu, tries, rep, fplu.compute(A));
BENCH(t_qr, tries, rep, qr.compute(A));
BENCH(t_cpqr, tries, rep, cpqr.compute(A));
+ BENCH(t_cod, tries, rep, cod.compute(A));
BENCH(t_fpqr, tries, rep, fpqr.compute(A));
if(size<500) // JacobiSVD is really too slow for large matrices
BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
+ BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV));
results["LLT"][id] = t_llt.best();
results["LDLT"][id] = t_ldlt.best();
@@ -46,8 +50,10 @@ void bench(int id, int size = Size)
results["FullPivLU"][id] = t_fplu.best();
results["HouseholderQR"][id] = t_qr.best();
results["ColPivHouseholderQR"][id] = t_cpqr.best();
+ results["CompleteOrthogonalDecomposition"][id] = t_cod.best();
results["FullPivHouseholderQR"][id] = t_fpqr.best();
results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+ results["BDCSVD"][id] = t_bdcsvd.best();
}
int main()
@@ -64,13 +70,15 @@ int main()
IOFormat fmt(3, 0, " \t", "\n", "", "");
- std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
- std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
- std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
- std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
+ std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
+ std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+ std::cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
}
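
For context, the two decompositions added to the benchmark, CompleteOrthogonalDecomposition and BDCSVD, are Eigen's rank-revealing alternatives to plain QR and JacobiSVD. A hedged usage sketch (matrix sizes are illustrative, not taken from the benchmark):

#include <Eigen/Dense>
using namespace Eigen;

MatrixXf A = MatrixXf::Random(100, 60);
VectorXf b = VectorXf::Random(100);

// Rank-revealing least-squares solve via complete orthogonal decomposition:
VectorXf x1 = CompleteOrthogonalDecomposition<MatrixXf>(A).solve(b);

// Divide-and-conquer SVD; scales much better than JacobiSVD on large matrices:
VectorXf x2 = BDCSVD<MatrixXf>(A, ComputeThinU | ComputeThinV).solve(b);
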
diff --git a/bench/tensors/README b/bench/tensors/README
new file mode 100644
index 000000000..4398aa81b
--- /dev/null
+++ b/bench/tensors/README
@@ -0,0 +1,12 @@
+Each benchmark comes in two flavors: one that runs on the CPU and one that runs on the GPU.
+
+To compile the floating point CPU benchmarks, simply call:
+g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the floating point GPU benchmarks, simply call:
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu
+
+
+To compile the half-float GPU benchmarks, call the command line below. You'll need a recent GPU supporting compute capability 5.3 or higher to run them, and nvcc 7.5 or higher to compile the code.
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu
+
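
Once built, the harness runs every registered benchmark by default; command-line arguments are interpreted as POSIX regular expressions, and only benchmarks whose names match are run (see Benchmark::ShouldRun in benchmark_main.cc). For example:

./benchmarks_cpu BM_contraction
./benchmarks_gpu memcpy rowReduction
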
diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h
new file mode 100644
index 000000000..f115b54ad
--- /dev/null
+++ b/bench/tensors/benchmark.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <vector>
+
+namespace testing {
+class Benchmark {
+ public:
+ Benchmark(const char* name, void (*fn)(int)) {
+ Register(name, fn, NULL);
+ }
+ Benchmark(const char* name, void (*fn_range)(int, int)) {
+ Register(name, NULL, fn_range);
+ }
+ Benchmark* Arg(int x);
+ Benchmark* Range(int lo, int hi);
+ const char* Name();
+ bool ShouldRun(int argc, char* argv[]);
+ void Run();
+ private:
+ const char* name_;
+ void (*fn_)(int);
+ void (*fn_range_)(int, int);
+ std::vector<int> args_;
+ void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
+ void RunRepeatedlyWithArg(int iterations, int arg);
+ void RunWithArg(int arg);
+};
+} // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t);
+void StopBenchmarkTiming();
+void StartBenchmarkTiming();
+#define BENCHMARK(f) \
+ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
+ (new ::testing::Benchmark(#f, f))
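
The registration model is minimal: a benchmark is a free function taking the iteration count chosen by the harness, plus an optional size argument supplied via Arg()/Range(). A hedged sketch of registering directly against this header (BM_demo is illustrative, not part of the suite):

#include "benchmark.h"

// N comes from Range() below; iters is picked by the harness so that the
// timed region runs for roughly one second.
static void BM_demo(int iters, int N) {
  StopBenchmarkTiming();            // exclude setup from the measurement
  // ... allocate and initialize buffers of size N ...
  StartBenchmarkTiming();
  for (int i = 0; i < iters; ++i) {
    // ... operation under test ...
  }
}
BENCHMARK(BM_demo)->Range(10, 5000);
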
diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc
new file mode 100644
index 000000000..1efa0dbad
--- /dev/null
+++ b/bench/tensors/benchmark_main.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "benchmark.h"
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <inttypes.h>
+#include <time.h>
+#include <map>
+#include <algorithm>
+
+static int64_t g_flops_processed;
+static int64_t g_benchmark_total_time_ns;
+static int64_t g_benchmark_start_time_ns;
+typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
+typedef BenchmarkMap::iterator BenchmarkMapIt;
+
+BenchmarkMap& gBenchmarks() {
+ static BenchmarkMap g_benchmarks;
+ return g_benchmarks;
+}
+
+static int g_name_column_width = 20;
+
+static int Round(int n) {
+ int base = 1;
+ while (base*10 < n) {
+ base *= 10;
+ }
+ if (n < 2*base) {
+ return 2*base;
+ }
+ if (n < 5*base) {
+ return 5*base;
+ }
+ return 10*base;
+}
+
+#ifdef __APPLE__
+ #include <mach/mach_time.h>
+ static mach_timebase_info_data_t g_time_info;
+ static void __attribute__((constructor)) init_info() {
+ mach_timebase_info(&g_time_info);
+ }
+#endif
+
+static int64_t NanoTime() {
+#if defined(__APPLE__)
+ uint64_t t = mach_absolute_time();
+ return t * g_time_info.numer / g_time_info.denom;
+#else
+ struct timespec t;
+ t.tv_sec = t.tv_nsec = 0;
+ clock_gettime(CLOCK_MONOTONIC, &t);
+ return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
+#endif
+}
+
+namespace testing {
+Benchmark* Benchmark::Arg(int arg) {
+ args_.push_back(arg);
+ return this;
+}
+
+Benchmark* Benchmark::Range(int lo, int hi) {
+ const int kRangeMultiplier = 8;
+ if (hi < lo) {
+ int temp = hi;
+ hi = lo;
+ lo = temp;
+ }
+ while (lo < hi) {
+ args_.push_back(lo);
+ lo *= kRangeMultiplier;
+ }
+ // We always run the hi number.
+ args_.push_back(hi);
+ return this;
+}
+
+const char* Benchmark::Name() {
+ return name_;
+}
+bool Benchmark::ShouldRun(int argc, char* argv[]) {
+ if (argc == 1) {
+ return true; // With no arguments, we run all benchmarks.
+ }
+ // Otherwise, we interpret each argument as a regular expression and
+ // see if any of our benchmarks match.
+ for (int i = 1; i < argc; i++) {
+ regex_t re;
+ if (regcomp(&re, argv[i], 0) != 0) {
+ fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ }
+ int match = regexec(&re, name_, 0, NULL, 0);
+ regfree(&re);
+ if (match != REG_NOMATCH) {
+ return true;
+ }
+ }
+ return false;
+}
+void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
+ name_ = name;
+ fn_ = fn;
+ fn_range_ = fn_range;
+ if (fn_ == NULL && fn_range_ == NULL) {
+ fprintf(stderr, "%s: missing function\n", name_);
+ exit(EXIT_FAILURE);
+ }
+ gBenchmarks().insert(std::make_pair(name, this));
+}
+void Benchmark::Run() {
+ if (fn_ != NULL) {
+ RunWithArg(0);
+ } else {
+ if (args_.empty()) {
+ fprintf(stderr, "%s: no args!\n", name_);
+ exit(EXIT_FAILURE);
+ }
+ for (size_t i = 0; i < args_.size(); ++i) {
+ RunWithArg(args_[i]);
+ }
+ }
+}
+void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
+ g_flops_processed = 0;
+ g_benchmark_total_time_ns = 0;
+ g_benchmark_start_time_ns = NanoTime();
+ if (fn_ != NULL) {
+ fn_(iterations);
+ } else {
+ fn_range_(iterations, arg);
+ }
+ if (g_benchmark_start_time_ns != 0) {
+ g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+ }
+}
+void Benchmark::RunWithArg(int arg) {
+ // run once in case it's expensive
+ int iterations = 1;
+ RunRepeatedlyWithArg(iterations, arg);
+ while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
+ int last = iterations;
+ if (g_benchmark_total_time_ns/iterations == 0) {
+ iterations = 1e9;
+ } else {
+ iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
+ }
+ iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
+ iterations = Round(iterations);
+ RunRepeatedlyWithArg(iterations, arg);
+ }
+ char throughput[100];
+ throughput[0] = '\0';
+ if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
+ double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
+ double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
+ snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
+ }
+ char full_name[100];
+ if (fn_range_ != NULL) {
+ if (arg >= (1<<20)) {
+ snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
+ } else if (arg >= (1<<10)) {
+ snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
+ } else {
+ snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
+ }
+ } else {
+ snprintf(full_name, sizeof(full_name), "%s", name_);
+ }
+ printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
+ iterations, g_benchmark_total_time_ns/iterations, throughput);
+ fflush(stdout);
+}
+} // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t x) {
+ g_flops_processed = x;
+}
+void StopBenchmarkTiming() {
+ if (g_benchmark_start_time_ns != 0) {
+ g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+ }
+ g_benchmark_start_time_ns = 0;
+}
+void StartBenchmarkTiming() {
+ if (g_benchmark_start_time_ns == 0) {
+ g_benchmark_start_time_ns = NanoTime();
+ }
+}
+int main(int argc, char* argv[]) {
+ if (gBenchmarks().empty()) {
+ fprintf(stderr, "No benchmarks registered!\n");
+ exit(EXIT_FAILURE);
+ }
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ int name_width = static_cast<int>(strlen(it->second->Name()));
+ g_name_column_width = std::max(g_name_column_width, name_width);
+ }
+ bool need_header = true;
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ ::testing::Benchmark* b = it->second;
+ if (b->ShouldRun(argc, argv)) {
+ if (need_header) {
+ printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
+ fflush(stdout);
+ need_header = false;
+ }
+ b->Run();
+ }
+ }
+ if (need_header) {
+ fprintf(stderr, "No matching benchmarks!\n");
+ fprintf(stderr, "Available benchmarks:\n");
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ fprintf(stderr, " %s\n", it->second->Name());
+ }
+ exit(EXIT_FAILURE);
+ }
+ return 0;
+}
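
The sizing loop in RunWithArg typically converges in two or three passes. For example, if the single warm-up iteration takes 2 ms, the 1e9/2e6 = 500-iteration estimate is first clamped to 100*last = 100 iterations; that pass takes about 0.2 s, the next estimate of 750 iterations rounds up to 1000 via Round(), and the resulting ~2 s pass exceeds the one-second target, ending the loop.
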
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 525b9acda..90b9bc741 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -4,16 +4,18 @@
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "testing/base/public/benchmark.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "benchmark.h"
+
+#define BENCHMARK_RANGE(bench, lo, hi) \
+ BENCHMARK(bench)->Range(lo, hi)
using Eigen::Tensor;
using Eigen::TensorMap;
-
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
-template <typename Device> class BenchmarkSuite {
+template <typename Device, typename T> class BenchmarkSuite {
public:
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
: m_(m), k_(k), n_(n), device_(device) {
@@ -35,37 +37,62 @@ template <typename Device> class BenchmarkSuite {
eigen_assert(m_ == k_ && k_ == n_);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
- device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
+ device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
}
// Record the number of values copied per second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+ }
+
+ void typeCasting(int num_iters) {
+ eigen_assert(m_ == n_);
+ Eigen::array<TensorIndex, 2> sizes;
+ if (sizeof(T) >= sizeof(int)) {
+ sizes[0] = m_;
+ sizes[1] = k_;
+ } else {
+ sizes[0] = m_ * sizeof(T) / sizeof(int);
+ sizes[1] = k_ * sizeof(T) / sizeof(int);
+ }
+ const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
+ TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.template cast<T>();
+ }
+ // Record the number of values copied per second
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+ Eigen::array<TensorIndex, 2> sizes;
+ sizes[0] = m_;
+ sizes[1] = m_;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = C.random();
}
// Record the number of random numbers generated per second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
-
- const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
- const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
- const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
- const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
- const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+ Eigen::array<TensorIndex, 2> sizes;
+ sizes[0] = m_;
+ sizes[1] = m_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+
+ const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
+ const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
+ const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -80,32 +107,76 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of values copied from the rhs slice to the lhs slice
// each second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+ }
+
+ void rowChip(int num_iters) {
+ Eigen::array<TensorIndex, 2> input_size;
+ input_size[0] = k_;
+ input_size[1] = n_;
+ const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ Eigen::array<TensorIndex, 1> output_size;
+ output_size[0] = n_;
+ TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % k_, 0);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+ }
+
+ void colChip(int num_iters) {
+ Eigen::array<TensorIndex, 2> input_size;
+ input_size[0] = k_;
+ input_size[1] = n_;
+ const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ Eigen::array<TensorIndex, 1> output_size;
+ output_size[0] = n_;
+ TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % n_, 1);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
-
- const Eigen::array<int, 2> shuffle(1, 0);
+ Eigen::array<TensorIndex, 2> size_a;
+ size_a[0] = m_;
+ size_a[1] = k_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+ Eigen::array<TensorIndex, 2> size_b;
+ size_b[0] = k_;
+ size_b[1] = m_;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+ Eigen::array<int, 2> shuffle;
+ shuffle[0] = 1;
+ shuffle[1] = 0;
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
// Record the number of values shuffled from A and copied to B each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void padding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+ Eigen::array<TensorIndex, 2> size_a;
+ size_a[0] = m_;
+ size_a[1] = k_-3;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+ Eigen::array<TensorIndex, 2> size_b;
+ size_b[0] = k_;
+ size_b[1] = m_;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
@@ -116,35 +187,46 @@ template <typename Device> class BenchmarkSuite {
B.device(device_) = A.pad(paddings);
}
// Record the number of values copied from the padded tensor A each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void striding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
-
- const Eigen::array<TensorIndex, 2> strides(1, 2);
+ Eigen::array<TensorIndex, 2> size_a;
+ size_a[0] = m_;
+ size_a[1] = k_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+ Eigen::array<TensorIndex, 2> size_b;
+ size_b[0] = m_;
+ size_b[1] = k_/2;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+
+ Eigen::array<TensorIndex, 2> strides;
+ strides[0] = 1;
+ strides[1] = 2;
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.stride(strides);
}
// Record the number of values copied from the strided tensor A each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void broadcasting(int num_iters) {
- const Eigen::array<TensorIndex, 2> size_a(m_, 1);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_c(m_, n_);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
-
-#if defined(__CUDACC__)
- // nvcc doesn't support cxx11
- const Eigen::array<int, 2> broadcast(1, n_);
+ Eigen::array<TensorIndex, 2> size_a;
+ size_a[0] = m_;
+ size_a[1] = 1;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+ Eigen::array<TensorIndex, 2> size_c;
+ size_c[0] = m_;
+ size_c[1] = n_;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ Eigen::array<int, 2> broadcast;
+ broadcast[0] = 1;
+ broadcast[1] = n_;
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
@@ -157,31 +239,35 @@ template <typename Device> class BenchmarkSuite {
C.device(device_) = A.broadcast(broadcast);
}
// Record the number of values broadcasted from A and copied to C each second
- finalizeBenchmark(m_ * n_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
}
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+ Eigen::array<TensorIndex, 2> sizes;
+ sizes[0] = m_;
+ sizes[1] = m_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
- C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
+ C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
}
// Record the number of FLOP executed per second (2 multiplications and
// 1 addition per value)
- finalizeBenchmark(3 * m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
}
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+ Eigen::array<TensorIndex, 2> sizes;
+ sizes[0] = m_;
+ sizes[1] = m_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -189,15 +275,17 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+ Eigen::array<TensorIndex, 2> sizes;
+ sizes[0] = m_;
+ sizes[1] = m_;
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -205,17 +293,57 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
- // Simple reduction
- void reduction(int num_iters) {
- const Eigen::array<TensorIndex, 2> input_size(k_, n_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
- const Eigen::array<TensorIndex, 1> output_size(n_);
- TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+ // Row reduction
+ void rowReduction(int num_iters) {
+ Eigen::array<TensorIndex, 2> input_size;
+ input_size[0] = k_;
+ input_size[1] = n_;
+ const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ Eigen::array<TensorIndex, 1> output_size;
+ output_size[0] = n_;
+ TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ Eigen::array<TensorIndex, 1> sum_along_dim;
+ sum_along_dim[0] = 0;
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
- const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.sum(sum_along_dim);
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+ }
+
+ // Column reduction
+ void colReduction(int num_iters) {
+ Eigen::array<TensorIndex, 2> input_size;
+ input_size[0] = k_;
+ input_size[1] = n_;
+ const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+ b_, input_size);
+ Eigen::array<TensorIndex, 1> output_size;
+ output_size[0] = k_;
+ TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
+ c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ Eigen::array<TensorIndex, 1> sum_along_dim;
+ sum_along_dim[0] = 1;
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -223,21 +351,48 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
- // do a contraction which is equivalent to a matrix multiplication
- void contraction(int num_iters) {
- const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
- const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
- const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+ // Full reduction
+ void fullReduction(int num_iters) {
+ Eigen::array<TensorIndex, 2> input_size;
+ input_size[0] = k_;
+ input_size[1] = n_;
+ const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
+ b_, input_size);
+ Eigen::array<TensorIndex, 0> output_size;
+ TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
+ c_, output_size);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.sum();
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+ }
- typedef typename Tensor<float, 2>::DimensionPair DimPair;
- const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ // do a contraction which is equivalent to a matrix multiplication
+ void contraction(int num_iters) {
+ Eigen::array<TensorIndex, 2> sizeA;
+ sizeA[0] = m_;
+ sizeA[1] = k_;
+ Eigen::array<TensorIndex, 2> sizeB;
+ sizeB[0] = k_;
+ sizeB[1] = n_;
+ Eigen::array<TensorIndex, 2> sizeC;
+ sizeC[0] = m_;
+ sizeC[1] = n_;
+
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+ const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+ typedef typename Tensor<T, 2>::DimensionPair DimPair;
+ Eigen::array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -245,18 +400,25 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (k_ multiplications and
// additions for each value in the resulting tensor)
- finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
- const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
- const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
- const Eigen::array<TensorIndex, 2> result_sizes(
- m_ - kernel_x + 1, n_ - kernel_y + 1);
- TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
- Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+ Eigen::array<TensorIndex, 2> input_sizes;
+ input_sizes[0] = m_;
+ input_sizes[1] = n_;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
+ Eigen::array<TensorIndex, 2> kernel_sizes;
+ kernel_sizes[0] = kernel_x;
+ kernel_sizes[1] = kernel_y;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+ Eigen::array<TensorIndex, 2> result_sizes;
+ result_sizes[0] = m_ - kernel_x + 1;
+ result_sizes[1] = n_ - kernel_y + 1;
+ TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
+ Eigen::array<TensorIndex, 2> dims;
+ dims[0] = 0;
+ dims[1] = 1;
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -264,42 +426,42 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (kernel_size
// multiplications and additions for each value in the resulting tensor)
- finalizeBenchmark(
- (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(2) *
+ (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
}
private:
void initialize() {
- a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
- b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
- c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+ a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+ b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+ c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
- device_.memset(a_, 12, m_ * k_ * sizeof(float));
- device_.memset(b_, 23, k_ * n_ * sizeof(float));
- device_.memset(c_, 31, m_ * n_ * sizeof(float));
+ device_.memset(a_, 12, m_ * k_ * sizeof(T));
+ device_.memset(b_, 23, k_ * n_ * sizeof(T));
+ device_.memset(c_, 31, m_ * n_ * sizeof(T));
- BenchmarkUseRealTime();
+ //BenchmarkUseRealTime();
}
- inline void finalizeBenchmark(int64 num_items) {
+ inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
device_.synchronize();
}
#endif
StopBenchmarkTiming();
- SetBenchmarkItemsProcessed(num_items);
+ SetBenchmarkFlopsProcessed(num_items);
}
- size_t m_;
- size_t k_;
- size_t n_;
- float* a_;
- float* b_;
- float* c_;
+ TensorIndex m_;
+ TensorIndex k_;
+ TensorIndex n_;
+ T* a_;
+ T* b_;
+ T* c_;
Device device_;
};
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
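
A recurring change above is the static_cast<int64_t> in every finalizeBenchmark call: the item counts overflow 32-bit arithmetic well within the benchmarked range. With m_ = k_ = 5000 (the top of the BENCHMARK_RANGE) and a few hundred iterations, m_ * k_ * num_iters is on the order of 10^10, far past the ~2.1e9 limit of a 32-bit int, so the product must be promoted to 64 bits before the multiplication happens rather than after.
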
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
index 68653ba15..8947f4b7f 100644
--- a/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -1,35 +1,31 @@
#define EIGEN_USE_THREADS
-#include "base/sysinfo.h"
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-#include "thread/threadpool.h"
+#include <string>
+
+#include "tensor_benchmarks.h"
-#ifdef __ANDROID__
-#define CREATE_THREAD_POOL(threads) \
-Eigen::ThreadPoolDevice device(threads);
-#else
#define CREATE_THREAD_POOL(threads) \
-ThreadPool tp(threads); \
-tp.StartWorkers(); \
-Eigen::ThreadPoolDevice device(&tp, threads);
-#endif
+Eigen::ThreadPool pool(threads); \
+Eigen::ThreadPoolDevice device(&pool, threads);
// Simple functions
-#define BM_FuncCPU(FUNC, THREADS) \
- static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
- StopBenchmarkTiming(); \
- CREATE_THREAD_POOL(THREADS); \
- BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
- suite.FUNC(iters); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
- } \
+#define BM_FuncCPU(FUNC, THREADS) \
+ static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ CREATE_THREAD_POOL(THREADS); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
+ suite.FUNC(iters); \
+ } \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
@@ -38,6 +34,14 @@ BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
@@ -66,26 +70,29 @@ BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
-BM_FuncCPU(reduction, 4);
-BM_FuncCPU(reduction, 8);
-BM_FuncCPU(reduction, 12);
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
// Contractions
-#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
- static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
- StopBenchmarkTiming(); \
- if (THREADS == 1) { \
- Eigen::DefaultDevice device; \
- BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
- suite.FUNC(iters); \
- } else { \
- CREATE_THREAD_POOL(THREADS); \
- BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
- suite.FUNC(iters); \
- } \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
- } \
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
+ static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ if (THREADS == 1) { \
+ Eigen::DefaultDevice device; \
+ BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \
+ suite.FUNC(iters); \
+ } else { \
+ CREATE_THREAD_POOL(THREADS); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+ suite.FUNC(iters); \
+ } \
+ } \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
@@ -107,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
@@ -125,9 +138,8 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
- BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+ BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
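
The new CREATE_THREAD_POOL relies only on Eigen: ThreadPoolDevice takes a pointer to the pool plus the number of threads it may use for a single expression. Outside the macro, the wiring looks like this (a sketch, with the thread count chosen arbitrarily):

#define EIGEN_USE_THREADS
#include "tensor_benchmarks.h"

Eigen::ThreadPool pool(8);                 // 8 worker threads
Eigen::ThreadPoolDevice device(&pool, 8);  // evaluates tensor expressions on the pool
BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, 2048, 2048, 2048);
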
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
new file mode 100644
index 000000000..d34bd73ca
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -0,0 +1,76 @@
+#define EIGEN_USE_GPU
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+// Simple functions
+#define BM_FuncGPU(FUNC) \
+ static void BM_##FUNC(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ Eigen::CudaStreamDevice stream; \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+//BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+//BM_FuncGPU(algebraicFunc);
+//BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
+ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ Eigen::CudaStreamDevice stream; \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+/*BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+*/
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
+ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
+ StopBenchmarkTiming(); \
+ Eigen::CudaStreamDevice stream; \
+ Eigen::GpuDevice device(&stream); \
+ BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
+ cudaDeviceSynchronize(); \
+ suite.FUNC(iters, DIM1, DIM2); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+/*
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+*/
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu
index adea754ad..76d68c5c1 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cc
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -3,47 +3,48 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-
+#include "tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+ BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
-BM_FuncGPU(reduction);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
+ BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
@@ -51,19 +52,18 @@ BM_FuncGPU(reduction);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
- BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+ BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
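
Finally, the reason cudaStreamCreate/cudaStreamDestroy disappear from all three GPU macros: Eigen::CudaStreamDevice owns its stream handling, and when default-constructed it wraps the default CUDA stream, so there is no stream lifetime left for the macros to manage. A sketch of the new wiring:

Eigen::CudaStreamDevice stream;    // no-arg constructor: uses the default stream
Eigen::GpuDevice device(&stream);  // GpuDevice is built on the stream wrapper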