Merged in benoitsteiner/opencl (pull request PR-309)

OpenCL improvements
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-04-05 14:28:08 +0000
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-04-05 14:28:08 +0000
commit: 0d08165a7f7a95c35dead32ec2d567e9a4b609b0 (patch)
tree: ac7c8a2f553056f3f3da70592fa821cec8aa883d /bench
parent: 4910630c968f9803bef5572bf7f17767c2ef09f1 (diff)
parent: 068cc0970890b534d65dbc99e6b5795acbaaa801 (diff)
4 files changed, 169 insertions, 27 deletions
diff --git a/bench/tensors/README b/bench/tensors/README
index 3a5fdbe17..69342cc9c 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D
 last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
 
-To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code):
+To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code):
 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
-{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1
 2. The host compilation pass that generates the final host binary.
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
+clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o
+clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I  {ComputeCpp_ROOT}/include/ -L  {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl
+export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib
+3. Run the benchmark
+./tensor_benchmark_sycl
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index c2fb3dede..3a640ede4 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -35,6 +35,11 @@ template <typename Device, typename T> class BenchmarkSuite {
 
   void memcpy(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
@@ -55,7 +60,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     }
     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.template cast<T>();
@@ -70,7 +79,6 @@ template <typename Device, typename T> class BenchmarkSuite {
     sizes[0] = m_;
     sizes[1] = m_;
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = C.random();
@@ -93,7 +101,18 @@ template <typename Device, typename T> class BenchmarkSuite {
     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.slice(first_quadrant, quarter_sizes).device(device_) =
@@ -118,7 +137,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % k_, 0);
@@ -135,7 +158,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % n_, 1);
@@ -158,7 +185,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<int, 2> shuffle;
     shuffle[0] = 1;
     shuffle[1] = 0;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.shuffle(shuffle);
@@ -186,7 +217,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.pad(paddings);
@@ -216,6 +251,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
 #endif
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.stride(strides);
@@ -245,6 +285,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     broadcast.set(1, n_);
 #endif
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.broadcast(broadcast);
@@ -261,7 +306,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
@@ -280,6 +329,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+}
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
@@ -297,7 +351,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.exp() + B.log();
@@ -325,7 +383,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum(sum_along_dim);
@@ -355,7 +417,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum(sum_along_dim);
@@ -375,7 +441,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 0> output_size;
     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum();
@@ -404,7 +474,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     typedef typename Tensor<T, 2>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims;
     dims[0] = DimPair(1, 0);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+     }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.contract(B, dims);
@@ -430,7 +504,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 2> dims;
     dims[0] = 0;
     dims[1] = 1;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+     }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.convolve(B, dims);
@@ -461,6 +539,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
       device_.synchronize();
     }
+#elif defined(EIGEN_USE_SYCL)
+    if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
+      device_.synchronize();
+    }
+
 #endif
     StopBenchmarkTiming();
     SetBenchmarkFlopsProcessed(num_items);
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
index 6df190869..cb6daac15 100644
--- a/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -1,20 +1,73 @@
-#define EIGEN_USE_SYCL
+#ifdef EIGEN_USE_SYCL
 
 #include <SYCL/sycl.hpp>
 #include <iostream>
 
 #include "tensor_benchmarks.h"
 
-#define BM_FuncGPU(FUNC)                                       \
-  static void BM_##FUNC(int iters, int N) {                    \
-    StopBenchmarkTiming();                                     \
-    cl::sycl::gpu_selector selector; \
-    Eigen::QueueInterface queue(selector);     \
-    Eigen::SyclDevice device(&queue);                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
-    suite.FUNC(iters);                                         \
-  }                                                            \
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
 BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3);        \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+#endif
diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
new file mode 100644
index 000000000..bcc3c4c79
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
@@ -0,0 +1,2 @@
+#include "tensor_benchmarks_sycl.cc"
+#include "tensor_benchmarks_sycl.sycl"
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2017-04-05 14:28:08 +0000
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2017-04-05 14:28:08 +0000
commit	0d08165a7f7a95c35dead32ec2d567e9a4b609b0 (patch)
tree	ac7c8a2f553056f3f3da70592fa821cec8aa883d /bench
parent	4910630c968f9803bef5572bf7f17767c2ef09f1 (diff)
parent	068cc0970890b534d65dbc99e6b5795acbaaa801 (diff)