From cb81975714a96ecb2faf33ca242feeee3543b1db Mon Sep 17 00:00:00 2001
From: Luke Iwanski <luke@codeplay.com>
Date: Mon, 19 Sep 2016 12:44:13 +0100
Subject: Partial OpenCL support via SYCL compatible with ComputeCpp CE.

---
 bench/tensors/README                    |  8 ++++++-
 bench/tensors/tensor_benchmarks_sycl.cc | 37 +++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 bench/tensors/tensor_benchmarks_sycl.cc

(limited to 'bench')
diff --git a/bench/tensors/README b/bench/tensors/README
index 803cb8ef8..3a5fdbe17 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -11,5 +11,11 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU
 We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
 nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
 
-last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call 
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the benchmark for SYCL using ComputeCpp, you currently need 2 passes (only for translation units containing device code):
+1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDEBUG -O3 -c tensor_benchmarks_sycl.cc
+2. The host compilation pass that generates the final host binary.
+clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
new file mode 100644
index 000000000..7eca4d966
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -0,0 +1,37 @@
+#define EIGEN_USE_SYCL
+
+#include <SYCL/sycl.hpp>
+#include <iostream>
+
+#include "tensor_benchmarks.h"
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+// Simple functions
+template <typename device_selector>
+cl::sycl::queue sycl_queue() {
+  return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) {
+    for (const auto& e : l) {
+      try {
+        std::rethrow_exception(e);
+      } catch (cl::sycl::exception e) {
+        std::cout << e.what() << std::endl;
+      }
+    }
+  });
+}
+
+#define BM_FuncGPU(FUNC) /* define BM_<FUNC>, then register it */      \
+  static void BM_##FUNC(int iters, int N) {                            \
+    StopBenchmarkTiming();                                             \
+    cl::sycl::queue gpu_queue = sycl_queue<cl::sycl::gpu_selector>();  \
+    SyclDevice sycl_device(gpu_queue);                                 \
+    BenchmarkSuite<SyclDevice, float> bench(sycl_device, N);           \
+    bench.FUNC(iters);                                                 \
+  }                                                                    \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
-- 
cgit v1.2.3