From 5e9a1e7a7a7eccbb20a2c4eb44141727b0943f11 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Wed, 8 Mar 2017 14:17:48 +0000
Subject: Adding SYCL benchmarks.

---
 bench/tensors/README                               |  10 +-
 bench/tensors/tensor_benchmarks.h                  | 106 ++++++++++++++++++---
 bench/tensors/tensor_benchmarks_sycl.cc            |  73 ++++++++++++--
 .../tensor_benchmarks_sycl_include_headers.cc      |   2 +
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h      |  33 ++++---
 5 files changed, 184 insertions(+), 40 deletions(-)
 create mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc

diff --git a/bench/tensors/README b/bench/tensors/README
index 3a5fdbe17..c4b742749 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D
 last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
 
-To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code):
+To compile and run the benchmark for SYCL using ComputeCpp, you currently need the following passes (only for translation units containing device code):
 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
-{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1
 2. The host compilation pass that generates the final host binary.
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
+clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o
+clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl
+export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib
+3. Run the benchmark
+./tensor_benchmark_sycl
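For reference, each benchmark entry point that these two passes build is generated by the BM_FuncGPU macro defined in tensor_benchmarks_sycl.cc (see below). The following is an approximate sketch of what one expansion, BM_FuncGPU(memcpy), compiles to; StopBenchmarkTiming() and BENCHMARK_RANGE() come from the harness in benchmark_main.cc:

// Illustration only: approximate expansion of BM_FuncGPU(memcpy).
#define EIGEN_USE_SYCL
#include <SYCL/sycl.hpp>
#include "tensor_benchmarks.h"

static void BM_memcpy(int iters, int N) {
  StopBenchmarkTiming();                  // keep device setup out of the timing
  cl::sycl::gpu_selector selector;        // select an OpenCL GPU device
  Eigen::QueueInterface queue(selector);  // owns the underlying cl::sycl::queue
  Eigen::SyclDevice device(&queue);
  BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);
  suite.memcpy(iters);                    // warmup loop + timed loop
}
BENCHMARK_RANGE(BM_memcpy, 10, 5000);     // run for sizes N from 10 to 5000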
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index c2fb3dede..325026113 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -35,6 +35,11 @@ template <typename Device, typename T> class BenchmarkSuite {
 
   void memcpy(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
@@ -55,7 +60,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     }
     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.template cast<T>();
@@ -70,7 +79,6 @@ template <typename Device, typename T> class BenchmarkSuite {
     sizes[0] = m_;
     sizes[1] = m_;
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> C(c_, sizes);
-
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = C.random();
@@ -93,7 +101,18 @@ template <typename Device, typename T> class BenchmarkSuite {
     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.slice(first_quadrant, quarter_sizes).device(device_) =
@@ -118,7 +137,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % k_, 0);
@@ -135,7 +158,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % n_, 1);
@@ -158,7 +185,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<int, 2> shuffle;
     shuffle[0] = 1;
     shuffle[1] = 0;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.shuffle(shuffle);
@@ -186,7 +217,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.pad(paddings);
@@ -216,6 +251,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
 #endif
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+#endif
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.stride(strides);
@@ -245,6 +285,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     broadcast.set(1, n_);
 #endif
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+#endif
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.broadcast(broadcast);
@@ -261,7 +306,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
@@ -280,6 +329,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> C(c_, sizes);
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
@@ -297,7 +351,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.exp() + B.log();
@@ -325,7 +383,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum(sum_along_dim);
@@ -355,7 +417,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum(sum_along_dim);
@@ -375,7 +441,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 0> output_size;
     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum();
@@ -404,7 +474,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     typedef typename Tensor<T, 2, 0, TensorIndex>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims;
     dims[0] = DimPair(1, 0);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.contract(B, dims);
@@ -430,7 +504,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 2> dims;
     dims[0] = 0;
     dims[1] = 1;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.convolve(B, dims);
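All of the hunks above add the same warmup shape: under EIGEN_USE_SYCL the expression is evaluated ten times before StartBenchmarkTiming(), presumably so that one-time costs (kernel creation and the initial host-to-device transfers) stay out of the measured region. A condensed sketch of the pattern, using the transcendentalFunc expression; A, B, C, device_, m_ and finalizeBenchmark() are the BenchmarkSuite members used throughout this header:

#ifdef EIGEN_USE_SYCL // warmup for sycl: untimed evaluations of the same expression
for (int iter = 0; iter < 10; ++iter) {
  C.device(device_) = A.exp() + B.log();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
  C.device(device_) = A.exp() + B.log();  // only steady-state runs are timed
}
// Record the amount of work so the harness can report throughput.
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);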
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
index 6df190869..cb6daac15 100644
--- a/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -1,20 +1,73 @@
-#define EIGEN_USE_SYCL
+#ifdef EIGEN_USE_SYCL
 
 #include <SYCL/sycl.hpp>
 #include <iostream>
 
 #include "tensor_benchmarks.h"
 
-#define BM_FuncGPU(FUNC)                                       \
-  static void BM_##FUNC(int iters, int N) {                    \
-    StopBenchmarkTiming();                                     \
-    cl::sycl::gpu_selector selector;                           \
-    Eigen::QueueInterface queue(selector);                     \
-    Eigen::SyclDevice device(&queue);                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
-    suite.FUNC(iters);                                         \
-  }                                                            \
+#define BM_FuncGPU(FUNC)                                           \
+  static void BM_##FUNC(int iters, int N) {                        \
+    StopBenchmarkTiming();                                         \
+    cl::sycl::gpu_selector selector;                               \
+    Eigen::QueueInterface queue(selector);                         \
+    Eigen::SyclDevice device(&queue);                              \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);     \
+    suite.FUNC(iters);                                             \
+  }                                                                \
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
 BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                       \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {        \
+    StopBenchmarkTiming();                                              \
+    cl::sycl::gpu_selector selector;                                    \
+    Eigen::QueueInterface queue(selector);                              \
+    Eigen::SyclDevice device(&queue);                                   \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
+    suite.FUNC(iters);                                                  \
+  }                                                                     \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {  \
+    StopBenchmarkTiming();                                     \
+    cl::sycl::gpu_selector selector;                           \
+    Eigen::QueueInterface queue(selector);                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters, DIM1, DIM2);                             \
+  }                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+#endif
diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
new file mode 100644
index 000000000..4b3110b85
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
@@ -0,0 +1,2 @@
+#include "tensor_benchmarks_sycl.cc"
+#include "tensor_benchmarks_sycl.sycl"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 23297a0a7..5fa7423d0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -149,16 +149,27 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
     std::lock_guard<std::mutex> lock(mutex_);
     buffer_map.clear();
   }
-  //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop
   /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
-  /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
-  /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
-  /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
+  /// pointer created as a key we find the sycl buffer. The host-side source is wrapped in a temporary sycl
+  /// buffer (using map_allocator) and a copy kernel is submitted to the queue. The first time that
   /// this buffer is accessed, the data will be copied to the device.
+  /// In this way the actual kernel execution is separated from the data transfer, which is required for benchmarking.
+  /// It is also faster, as it uses the map_allocator instead of a host-side memcpy.
   template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    auto host_acc= find_buffer(dst)->second. template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
-    ::memcpy(host_acc.get_pointer(), src, n);
+    auto it =find_buffer(dst);
+    auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first;
+    offset/=sizeof(Index);
+    size_t rng, GRange, tileSize;
+    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+    auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n));
+    m_queue.submit([&](cl::sycl::handler &cgh) {
+      auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write>(cgh);
+      auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read>(cgh);
+      typedef decltype(src_acc) read_accessor;
+      typedef decltype(dst_acc) write_accessor;
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0));
+    });
+    synchronize();
   }
   /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
   /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
@@ -167,7 +178,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
   /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
   /// to the cpu only once per function call.
   template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     auto it =find_buffer(src);
     auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
     offset/=sizeof(Index);
@@ -186,7 +196,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
 
   /// the memcpy function
   template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     auto it1 = find_buffer(static_cast<const void*>(src));
     auto it2 = find_buffer(dst);
     auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
@@ -206,7 +215,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
   }
 
   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     size_t rng, GRange, tileSize;
     parallel_for_setup(n, tileSize, rng, GRange);
     auto it1 = find_buffer(static_cast<const void*>(data));
@@ -220,18 +228,15 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
   /// the function then adds an entry by creating a sycl buffer for that particular pointer.
   template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
   get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
   }
 
   /// Accessing the created sycl device buffer for the device pointer
   EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     return find_buffer(ptr)->second;
   }
 
   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
     return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first);
   }
 
@@ -375,7 +380,9 @@ private:
   mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
   /// sycl queue
   mutable cl::sycl::queue m_queue;
+
   EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>::iterator find_buffer(const void* ptr) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
     if (it1 != buffer_map.end()){
       return it1;
-- 
cgit v1.2.3
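Below is a minimal standalone sketch of the copy strategy that the reworked memcpyHostToDevice uses: instead of taking a host accessor and calling ::memcpy, the host source is wrapped in a buffer that uses map_allocator (a ComputeCpp allocator that maps the existing host memory rather than copying it), and a small kernel performs the copy on the device. The helper name copy_host_to_device and the kernel name tag are hypothetical; Eigen's actual implementation goes through MemCopyFunctor and parallel_for_setup.

#include <SYCL/sycl.hpp>

template <typename T> class copy_kernel;  // kernel name tag, unique per T

template <typename T>
void copy_host_to_device(cl::sycl::queue &q, cl::sycl::buffer<T, 1> &dst,
                         const T *src, size_t count, size_t dst_offset) {
  // Wrap the host memory; map_allocator avoids an extra host-side copy.
  cl::sycl::buffer<T, 1, cl::sycl::map_allocator<T> > src_buf(
      const_cast<T *>(src), cl::sycl::range<1>(count));
  q.submit([&](cl::sycl::handler &cgh) {
    auto s = src_buf.template get_access<cl::sycl::access::mode::read>(cgh);
    auto d = dst.template get_access<cl::sycl::access::mode::write>(cgh);
    cgh.parallel_for<copy_kernel<T> >(
        cl::sycl::range<1>(count), [=](cl::sycl::item<1> item) {
          d[item[0] + dst_offset] = s[item[0]];  // device-side element copy
        });
  });
  q.wait_and_throw();  // same effect as SyclDevice::synchronize()
}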