From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 28 Jun 2019 10:08:23 +0100
Subject: [SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL.

* Abstracting the pointer type so that both SYCL memory and pointers can be captured.
* Converting the SYCL virtual pointer to SYCL device memory in the Eigen evaluator class.
* Binding SYCL placeholder accessors to the command group handler by using the bind method in the Eigen evaluator node.
* Adding a SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to adopt the above changes.
---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 153 +++++++++++++++------
 1 file changed, 110 insertions(+), 43 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index bb63433fe..5dddfcf85 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -299,7 +299,7 @@ template
 ::reduce(self, 0, num_coeffs, reducer);
   }
@@ -400,6 +400,18 @@ struct OuterReducer {
   }
 };

+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template
+struct GenericReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+#endif

 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template
@@ -423,6 +435,23 @@ template
 __global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif

+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input
+ * with the accumulator, and the other for reducing two accumulators.
+ * Such a reducer can be useful, for instance, when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template
+struct ReductionReturnType {
+#if EIGEN_HAS_CXX11 && defined(EIGEN_USE_SYCL)
+  typedef typename remove_const().initialize())>::type type;
+#else
+  typedef typename remove_const::type type;
+#endif
+};
+
 template
@@ -520,12 +549,15 @@ class TensorReductionOp : public TensorBase
+struct TensorReductionEvaluatorBase;
 // Eval as rvalue
 template class MakePointer_, typename Device>
-struct TensorEvaluator, Device>
+struct TensorReductionEvaluatorBase, Device>
 {
   typedef internal::reducer_traits ReducerTraits;
+  typedef Dims ReducedDims;
   typedef TensorReductionOp XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -535,12 +567,20 @@ struct TensorEvaluator,
   static const int NumOutputDims = NumInputDims - NumReducedDims;
   typedef typename internal::conditional, DSizes >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef TensorEvaluator, Device> Self;
+  typedef TensorReductionEvaluatorBase, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator::PacketAccess;
-  typedef typename internal::remove_const::type CoeffReturnType;
+  typedef typename internal::ReductionReturnType::type CoeffReturnType;
   typedef typename PacketType::type PacketReturnType;
   static const Index PacketSize = PacketType::size;
+  typedef typename Eigen::internal::traits::PointerType TensorPointerType;
+  typedef StorageMemory Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1::size;
+
   enum {
     IsAligned = false,
     PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
@@ -562,11 +602,8 @@ struct TensorEvaluator,
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value;
   static const bool RunningFullReduction = (NumOutputDims==0);

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
-#if defined(EIGEN_USE_SYCL)
-    , m_xpr_dims(op.dims())
-#endif
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -653,7 +690,7 @@ struct TensorEvaluator,
   // of which will eventually result in an NVCC error
   EIGEN_DEVICE_FUNC
 #endif
-  bool evalSubExprsIfNeeded(typename MakePointer_::Type data) {
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);

     // Use the FullReducer if possible.
@@ -663,7 +700,7 @@ struct TensorEvaluator,
         !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType)));
+        m_result = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
         data = m_result;
         need_assign = true;
       }
@@ -671,20 +708,9 @@ struct TensorEvaluator,
       internal::FullReducer::run(*this, reducer, m_device, data);
       return need_assign;
     }
-    else if(RunningOnSycl){
-      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data) {
-        data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
-      }
-      Op reducer(m_reducer);
-      internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-      return (m_result != NULL);
-    }
     // Attempt to use an optimized reduction.
-    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast(Layout) == static_cast(ColMajor)) {
@@ -698,8 +724,8 @@ struct TensorEvaluator,
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
       if (!data) {
-        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-          data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+          data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
           m_result = data;
         }
         else {
@@ -707,6 +733,7 @@ struct TensorEvaluator,
         }
       }
       Op reducer(m_reducer);
+      // For SYCL this call always returns false.
       if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
         if (m_result) {
           m_device.deallocate_temp(m_result);
@@ -731,8 +758,8 @@ struct TensorEvaluator,
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
       if (!data) {
-        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-          data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+          data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
          m_result = data;
         }
         else {
@@ -740,6 +767,7 @@ struct TensorEvaluator,
         }
       }
       Op reducer(m_reducer);
+      // For SYCL this call always returns false.
       if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
         if (m_result) {
           m_device.deallocate_temp(m_result);
@@ -750,6 +778,21 @@ struct TensorEvaluator,
         return (m_result != NULL);
       }
     }
+    #if defined(EIGEN_USE_SYCL)
+    // If there is no optimised version for SYCL, the reduction expression
+    // must be broken into two subexpressions and the generic SYCL reducer used on the device.
+    if(RunningOnSycl) {
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+      if (!data) {
+        data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+        m_result = data;
+      }
+      Op reducer(m_reducer);
+      internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+      return (m_result != NULL);
+    }
+    #endif
     }
     return true;
   }
@@ -764,7 +807,7 @@ struct TensorEvaluator,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+    if (( RunningFullReduction || RunningOnGPU) && m_result ) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
@@ -1097,12 +1140,15 @@ struct TensorEvaluator,
     m_device.deallocate(reducers);
   }

-  EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; }
-
-#if defined(EIGEN_USE_SYCL)
-  const TensorEvaluator& impl() const { return m_impl; }
-  const Device& device() const { return m_device; }
-  const Dims& xprDims() const { return m_xpr_dims; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+  // Binding placeholder accessors to a command group handler for SYCL.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_result.bind(cgh);
+  }
 #endif

   private:
@@ -1126,8 +1172,9 @@ struct TensorEvaluator,
 #endif

 #if defined(EIGEN_USE_SYCL)
- template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
- template friend class TensorSycl::internal::FullReductionKernelFunctor;
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor;
+ // SYCL needs the generic reducer for the case where the reduction algorithm is not an inner, outer, or full reducer.
+ template friend struct internal::GenericReducer;
 #endif

@@ -1255,9 +1302,6 @@ struct TensorEvaluator,
   // Precomputed strides for the output tensor.
   array m_outputStrides;
   array, NumOutputDims> m_fastOutputStrides;
-  // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  static const int NumPreservedStrides = max_n_1::size;
   array m_preservedStrides;
   // Map from output to input dimension index.
   array m_output_to_input_dim_map;
@@ -1288,13 +1332,36 @@ static const bool RunningOnGPU = false;
   static const bool RunningOnGPU = false;
   static const bool RunningOnSycl = false;
 #endif
-  typename MakePointer_::Type m_result;
+  EvaluatorPointerType m_result;

-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
+};

-#if defined(EIGEN_USE_SYCL)
-  const Dims m_xpr_dims;
-#endif
+template class MakePointer_, typename Device>
+struct TensorEvaluator, Device>
+: public TensorReductionEvaluatorBase, Device> {
+  typedef TensorReductionEvaluatorBase, Device> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template class MakePointer_>
+struct TensorEvaluator, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase, Eigen::SyclDevice> {
+
+  typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+  // The coeff function in the base class uses a recursive method which is not standard layout and cannot be used inside a SYCL kernel,
+  // so the coeff function is overridden here for the SYCL kernel.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base class uses a recursive method which is not standard layout and cannot be used inside a SYCL kernel,
+  // so the packet function is overridden here for the SYCL kernel.
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload(this->data() + index);
+  }
 };

 } // end namespace Eigen
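As the new ReductionReturnType comment in the patch explains, a SYCL reduction may use an accumulator type that differs from the input type, in which case the reducer must provide two reduce overloads. The following is a minimal illustrative sketch of such a functor, assuming Eigen's usual reducer interface (initialize/reduce/finalize) and ignoring the vectorized packet path; the name AnyNonFiniteReducer and the float input type are made up for the example and are not part of the patch.

#include <unsupported/Eigen/CXX11/Tensor>

// Illustrative reducer whose accumulator type (bool) differs from the input
// type (float): it records whether any reduced coefficient is non-finite.
struct AnyNonFiniteReducer {
  // The reduction return type is deduced from initialize(), i.e. bool.
  EIGEN_DEVICE_FUNC bool initialize() const { return false; }
  // Reduce one input element into the accumulator.
  EIGEN_DEVICE_FUNC void reduce(const float t, bool* accum) const {
    *accum = *accum || !(Eigen::numext::isfinite)(t);
  }
  // Reduce two partial accumulators together.
  EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
    *accum = *accum || t;
  }
  EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
};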
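For context, a minimal end-to-end sketch of what this patch series enables: evaluating a tensor reduction on a SyclDevice. It follows the usage pattern of Eigen's SYCL tests (get_sycl_supported_devices, QueueInterface, SyclDevice, explicit host/device copies); exact setup calls may differ between Eigen versions, so treat the names below as assumptions rather than a fixed API.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Pick a SYCL device and wrap its queue in an Eigen SyclDevice.
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  // Host tensors: reduce a 100x100 tensor over its first dimension.
  Eigen::Tensor<float, 2> in(100, 100);
  Eigen::Tensor<float, 1> out(100);
  in.setRandom();

  // Device buffers obtained through the SYCL virtual-pointer interface.
  float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2>> in_map(gpu_in, 100, 100);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> out_map(gpu_out, 100);

  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));

  // The reduction expression is evaluated by the SYCL executor on the device.
  Eigen::array<int, 1> red_axis{{0}};
  out_map.device(sycl_device) = in_map.sum(red_axis);

  sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
  return 0;
}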