From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding the get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying load to private register for tall/skinny no-shared.
* Unifying load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake.
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 245 +++++++++++++++------
 1 file changed, 172 insertions(+), 73 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6f8b6f193..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,7 +16,6 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
 
 #include <unordered_set>
-
 namespace Eigen {
 
 namespace TensorSycl {
@@ -70,9 +69,9 @@ struct SyclDeviceInfo {
 } // end namespace TensorSycl
 
 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and 
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently 
-// TensorFlow via the Eigen SYCL Backend. 
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
     -> decltype(cl::sycl::device::get_devices()) {
 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@@ -421,6 +420,91 @@ class QueueInterface {
     return pMapper.get_offset(ptr);
   }
 
+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                   Range thread_range,
+                                                   Index scratchSize,
+                                                   T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
 #ifdef EIGEN_EXCEPTIONS
     m_queue.wait_and_throw();
@@ -429,6 +513,7 @@ class QueueInterface {
 #endif
   }
 
+
   EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
     set_latest_event(e);
 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
@@ -457,11 +542,10 @@ class QueueInterface {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -469,26 +553,28 @@ class QueueInterface {
                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize1 =
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }
 
@@ -496,9 +582,9 @@ class QueueInterface {
   /// threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -506,45 +592,48 @@ class QueueInterface {
                                     EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize2 =
+    local_range[2] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
-    rng2 = dim2;
-    if (rng2 == 0) rng1 = static_cast<Index>(1);
-    GRange2 = rng2;
-    if (tileSize2 > GRange2)
-      tileSize2 = GRange2;
-    else if (GRange2 > tileSize2) {
-      Index xMode = static_cast<Index>(GRange2 % tileSize2);
-      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
     }
     pow_of_2 = static_cast<Index>(
-        std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
-    tileSize1 =
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 =
-        static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }
 
   EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
   return false;
 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
   return true;
@@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
-                                       rng1, GRange0, GRange1);
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }
 
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
-                                       tileSize2, rng0, rng1, rng2, GRange0,
-                                       GRange1, GRange2);
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }
 
   /// allocate device memory
@@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
     return queue_stream()->getDeviceVendor();
   }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
 };
 
 } // end namespace Eigen
-- 
cgit v1.2.3
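
For readers coming from the old interface, the sketch below shows how a caller would use the array-based parallel_for_setup overload added by this patch to obtain a SYCL nd_range. This is a minimal usage sketch, not part of the commit: make_thread_range is a hypothetical helper, and it assumes an Eigen build with the SYCL backend enabled, as in the Eigen SYCL tests.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical helper: packs the problem dimensions into a std::array (the
// old overload took eight separate Index in/out parameters instead) and lets
// parallel_for_setup compute matching global/local SYCL ranges.
template <typename Index>
cl::sycl::nd_range<2> make_thread_range(const Eigen::SyclDevice &dev,
                                        Index dim0, Index dim1) {
  std::array<Index, 2> input_dim = {{dim0, dim1}};
  cl::sycl::range<2> global_range(1, 1);
  cl::sycl::range<2> local_range(1, 1);
  dev.parallel_for_setup(input_dim, global_range, local_range);
  // Each global dimension has been rounded up to a multiple of the chosen
  // work-group size, so the nd_range below is always valid.
  return cl::sycl::nd_range<2>(global_range, local_range);
}

Packing the dimensions this way lets one QueueInterface implementation serve both the 2D and 3D cases, which is why the SyclDevice wrappers reduce to one-line forwarders, and the resulting nd_range is the kind of thread_range argument the unary/binary/nullary kernel launchers introduced above take.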