aboutsummaryrefslogtreecommitdiffhomepage
path: root/bench/tensors
diff options
context:
space:
mode:
author Luke Iwanski <luke@codeplay.com> 2016-09-19 12:44:13 +0100
committer Luke Iwanski <luke@codeplay.com> 2016-09-19 12:44:13 +0100
commit cb81975714a96ecb2faf33ca242feeee3543b1db (patch)
tree febc8730a60a48572cb293696c170d7cb50a4728 /bench/tensors
parent 59bacfe5201b54a6303b79bb538671d04f91dbce (diff)
Partial OpenCL support via SYCL compatible with ComputeCpp CE.
Diffstat (limited to 'bench/tensors')
-rw-r--r-- bench/tensors/README 8
-rw-r--r-- bench/tensors/tensor_benchmarks_sycl.cc 37
2 files changed, 44 insertions, 1 deletions
diff --git a/bench/tensors/README b/bench/tensors/README
index 803cb8ef8..3a5fdbe17 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -11,5 +11,11 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU
We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
-last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
+last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the benchmark for SYCL using ComputeCpp, you currently need 2 passes (only for translation units containing device code):
+1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDEBUG -O3 -c tensor_benchmarks_sycl.cc
+2. The host compilation pass that generates the final host binary.
+clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
new file mode 100644
index 000000000..7eca4d966
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -0,0 +1,37 @@
+#define EIGEN_USE_SYCL
+
+#include <SYCL/sycl.hpp>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+// Returns a SYCL queue on the device picked by `device_selector`; exceptions handed to the queue's handler are printed to std::cout.
+template <typename device_selector>
+cl::sycl::queue sycl_queue() {
+  return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) {
+    for (const auto& eptr : l) {  // each entry is rethrown so it can be caught by type
+      try {
+        std::rethrow_exception(eptr);
+      } catch (cl::sycl::exception& e) {  // by reference: no copy, no slicing of derived SYCL exceptions
+        std::cout << e.what() << std::endl;
+      }
+    }
+  });
+}
+
+#define BM_FuncGPU(FUNC) /* Defines BM_<FUNC>(iters, N), which runs BenchmarkSuite::FUNC on a SYCL GPU queue, and hands it to BENCHMARK_RANGE. */ \
+ static void BM_##FUNC(int iters, int N) { \
+ StopBenchmarkTiming(); /* NOTE(review): presumably keeps the setup below out of the measured time - confirm against the benchmark harness */ \
+ cl::sycl::queue q = sycl_queue<cl::sycl::gpu_selector>(); /* queue on the device chosen by gpu_selector */ \
+ Eigen::SyclDevice device(q); \
+ BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); /* float suite bound to the SYCL device; N is forwarded unchanged */ \
+ suite.FUNC(iters); \
+ } \
+ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); /* NOTE(review): 10 and 5000 are presumably the bounds for N - confirm against BENCHMARK_RANGE */
+
+BM_FuncGPU(broadcasting);  // expands to BM_broadcasting, which calls suite.broadcasting(iters)
+BM_FuncGPU(coeffWiseOp);  // expands to BM_coeffWiseOp, which calls suite.coeffWiseOp(iters)