path: root/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
author    Mehdi Goli <mehdi.goli@codeplay.com>  2019-11-28 10:08:54 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com>  2019-11-28 10:08:54 +0000
commit    00f32752f7d0b193c6788691c3cf0b76457a044d (patch)
tree      792e46110f0751ea8802fa9d403d1472d5977ac3 /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
parent    ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff)
[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.
* Unifying all loadLocalTile calls from lhs and rhs into a single extract_block function.
* Adding the get_tensor operation that was missing from TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in the SYCL device.
* Unifying the load-to-private-registers path for the tall/skinny no-shared-memory case.
* Unifying the load-to-vector-tile path for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake configuration.
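The central refactor in this patch is a set of three generic kernel launchers (binary, unary, nullary) on QueueInterface, replacing the removed per-operation LHS/RHS extraction classes. As a rough sketch of the calling convention they impose, a kernel functor receives the local scratch accessor first, then the bound placeholder accessors, then any trailing arguments. Everything named below (MyKernel, in_acc, out_acc, scratch_size) is illustrative, not part of the patch:

#include <CL/sycl.hpp>

// Illustrative only: a functor shape compatible with unary_kernel_launcher.
template <typename OutScalar, typename InAcc, typename OutAcc>
struct MyKernel {
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write,
                             cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  InAcc in;
  OutAcc out;
  MyKernel(LocalAccessor s, InAcc i, OutAcc o) : scratch(s), in(i), out(o) {}
  void operator()(cl::sycl::nd_item<1> item) const {
    // per-work-item body elided
  }
};

// Caller side (sketch): the launcher binds the accessors, allocates the local
// scratch, submits on the queue, and finishes with async_synchronize:
//   device.unary_kernel_launcher<OutScalar, MyKernel<OutScalar, InAcc, OutAcc> >(
//       in_acc, out_acc, cl::sycl::nd_range<1>(global, local), scratch_size);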
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h  245
1 file changed, 172 insertions(+), 73 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6f8b6f193..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,7 +16,6 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>
-
namespace Eigen {
namespace TensorSycl {
@@ -70,9 +69,9 @@ struct SyclDeviceInfo {
} // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
-// TensorFlow via the Eigen SYCL Backend.
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
-> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@@ -421,6 +420,91 @@ class QueueInterface {
return pMapper.get_offset(ptr);
}
+ template <typename OutScalar, typename sycl_kernel, typename Lhs,
+ typename Rhs, typename OutPtr, typename Range, typename Index,
+ typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+ const Rhs &rhs, OutPtr outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a command group handler
+ lhs.bind(cgh);
+ rhs.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename OutPtr, typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+ OutPtr &outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a command group handler
+ inptr.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a command group handler
+ inptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+
EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
m_queue.wait_and_throw();
@@ -429,6 +513,7 @@ class QueueInterface {
#endif
}
+
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
@@ -457,11 +542,10 @@ class QueueInterface {
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
- Index &tileSize0,
- Index &tileSize1, Index &rng0,
- Index &rng1, Index &GRange0,
- Index &GRange1) const {
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ std::array<Index, 2> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
@@ -469,26 +553,28 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =
+ local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- rng1 = dim1;
- if (rng1 == 0) rng1 = static_cast<Index>(1);
- GRange1 = rng1;
- if (tileSize1 > GRange1)
- tileSize1 = GRange1;
- else if (GRange1 > tileSize1) {
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
- tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
- rng0 = dim0;
- if (rng0 == 0) rng0 = static_cast<Index>(1);
- GRange0 = rng0;
- if (tileSize0 > GRange0)
- tileSize0 = GRange0;
- else if (GRange0 > tileSize0) {
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
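Per dimension, the logic above clamps the tile (local) size to the range, then pads the global range up to the next multiple of the local size, as cl::sycl::nd_range requires. The same padding arithmetic in isolation; round_up is an illustrative name, not from the patch:

#include <cstddef>

// Pad `global` up to the next multiple of `local`; e.g. (100, 16) -> 112.
// This mirrors the xMode adjustment in parallel_for_setup above.
static std::size_t round_up(std::size_t global, std::size_t local) {
  const std::size_t rem = global % local;
  return rem == 0 ? global : global + (local - rem);
}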
@@ -496,9 +582,9 @@ class QueueInterface {
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
- Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
- Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
- Index &GRange1, Index &GRange2) const {
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ std::array<Index, 3> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
@@ -506,45 +592,48 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =
+ local_range[2] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
- rng2 = dim2;
- if (rng2 == 0) rng1 = static_cast<Index>(1);
- GRange2 = rng2;
- if (tileSize2 > GRange2)
- tileSize2 = GRange2;
- else if (GRange2 > tileSize2) {
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+ input_range[2] = input_dim[2];
+ if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
+ global_range[2] = input_range[2];
+ if (local_range[2] > global_range[2])
+ local_range[2] = global_range[2];
+ else if (global_range[2] > local_range[2]) {
+ Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+ if (xMode != 0)
+ global_range[2] += static_cast<Index>(local_range[2] - xMode);
}
pow_of_2 = static_cast<Index>(
- std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
- tileSize1 =
+ std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+ local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- rng1 = dim1;
- if (rng1 == 0) rng1 = static_cast<Index>(1);
- GRange1 = rng1;
- if (tileSize1 > GRange1)
- tileSize1 = GRange1;
- else if (GRange1 > tileSize1) {
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
- tileSize0 =
- static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
- rng0 = dim0;
- if (rng0 == 0) rng0 = static_cast<Index>(1);
- GRange0 = rng0;
- if (tileSize0 > GRange0)
- tileSize0 = GRange0;
- else if (GRange0 > tileSize0) {
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ local_range[0] = static_cast<Index>(max_workgroup_Size /
+ (local_range[1] * local_range[2]));
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
return true;
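When neither EIGEN_SYCL_LOCAL_MEM nor EIGEN_SYCL_NO_LOCAL_MEM settles the question at compile time, the remaining branch (elided from this hunk) must decide at runtime. A hedged sketch of such a check via the standard SYCL 1.2.1 info query; the helper name is an assumption, not Eigen's code:

#include <CL/sycl.hpp>

// Sketch only: report whether the device exposes dedicated local memory.
bool device_has_dedicated_local_memory(const cl::sycl::device &d) {
  return d.get_info<cl::sycl::info::device::local_mem_type>() ==
         cl::sycl::info::local_mem_type::local;
}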
@@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
- Index &tileSize0,
- Index &tileSize1, Index &rng0,
- Index &rng1, Index &GRange0,
- Index &GRange1) const {
- queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
- rng1, GRange0, GRange1);
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
- Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
- Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
- Index &GRange1, Index &GRange2) const {
- queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
- tileSize2, rng0, rng1, rng2, GRange0,
- GRange1, GRange2);
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// allocate device memory
@@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
return queue_stream()->getDeviceVendor();
}
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+ queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+ queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+ queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
};
} // end namespace Eigen
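Finally, a sketch of how a call site migrates to the array-based parallel_for_setup overloads introduced here, assuming the Eigen SYCL headers are available; rows, cols, and device are placeholders:

#include <array>

// Old shape: parallel_for_setup(dim0, dim1, tileSize0, tileSize1,
//                               rng0, rng1, GRange0, GRange1);
// New shape: pack the dimensions, receive SYCL ranges directly.
template <typename Index>
void setup_ranges(const Eigen::SyclDevice &device, Index rows, Index cols) {
  std::array<Index, 2> input_dim{{rows, cols}};
  cl::sycl::range<2> global_range(1, 1);
  cl::sycl::range<2> local_range(1, 1);
  device.parallel_for_setup(input_dim, global_range, local_range);
  // The resulting nd_range would feed one of the kernel launchers above.
  cl::sycl::nd_range<2> thread_range(global_range, local_range);
  (void)thread_range;
}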