author    Mehdi Goli <mehdi.goli@codeplay.com>  2017-01-16 13:58:49 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com>  2017-01-16 13:58:49 +0000
commit    e46e7223817cfd982edec6d8e25c77e8e2493d78 (patch)
tree      3b8345ae7bb7ab2434b117932aea51f016acf43d /unsupported/Eigen/CXX11
parent    23778a15d8570b4287820f540b719203e07cfb44 (diff)
Adding TensorReverseOp, TensorStriding and TensorConversionOp for SYCL; modifying TensorContractionSycl so that the contraction node can be located anywhere in the expression tree.
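As a rough usage sketch (not part of the patch; assumes Eigen built with EIGEN_USE_SYCL and device-accessible buffers), the change lets a contraction feed further ops instead of having to be the root of the tree:

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void contraction_in_tree(const Eigen::SyclDevice& sycl_device,
                         float* a, float* b, float* c, float* out) {
  Eigen::TensorMap<Eigen::Tensor<float, 2>> A(a, 64, 128);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> B(b, 128, 32);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> C(c, 64, 32);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> Out(out, 64, 32);
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
  // The contraction is a sub-expression of the addition, not the tree root.
  Out.device(sycl_device) = A.contract(B, dims) + C;
}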
Diffstat (limited to 'unsupported/Eigen/CXX11')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h                   |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h               | 131
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h                    |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h                    |  37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h           |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h                        |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h                      |  25
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h                 |  14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h                       |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h                      |  20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h |  14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h           |  52
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h           |  17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h           |  35
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h                 |  15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h           |  14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h                       |  28
17 files changed, 299 insertions, 120 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 2ac6abf69..1b8017349 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -156,7 +156,7 @@ struct TensorContractionEvaluatorBase
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
- m_result(NULL), m_expr_indices(op.indices()) {
+ m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -564,9 +564,6 @@ struct TensorContractionEvaluatorBase
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
- /// required for sycl
- const Indices m_expr_indices;
-
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index b170a1a5c..dc16f89e0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
+ LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
+ this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
+ this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
// required by sycl to construct the expr on the device. Returns original left_impl
const TensorEvaluator<LeftArgType, Device>& left_impl() const {
@@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const TensorEvaluator<RightArgType, Device>& right_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
}
- // required by sycl to construct the expr on the device
- const Indices& indices() const {return this->m_expr_indices;}
};
-/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
-template<typename Expr> struct TensorEvaluatorContainer;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
- typedef Eigen::DefaultDevice Device;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
- : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.lhsExpression(), op.rhsExpression()), device),
- m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.rhsExpression(), op.lhsExpression()), device){}
-LeftEvaluator m_leftImpl;
-RightEvaluator m_rightImpl;
-};
-
-
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
+template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
-int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{
-
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-
- FunctorExpr functors;
+int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
+ typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
+ typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
+ typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
+ LHSFunctorExpr lhs_functors;
+ RHSFunctorExpr rhs_functors;
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
@@ -206,38 +177,50 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
RightNocontractT m_j_strides, m_right_nocontract_strides;
- TupleType tuple_of_accessors;
+ LHSTupleType left_tuple_of_accessors;
+ RHSTupleType right_tuple_of_accessors;
+ Device dev;
+
- KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+ KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
- :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+ LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
+ :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- tuple_of_accessors(tuple_of_accessors_){}
+ left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
- typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
+ typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
+ auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
+ auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
+ typedef decltype(lhs_dev_expr.expr) LeftArgType;
+ typedef decltype(rhs_dev_expr.expr) RightArgType;
+ typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- typename DevEvaluator::LeftEvaluator, LeftNocontractT,
+ LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- typename DevEvaluator::RightEvaluator, RightNocontractT,
+ RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initializing the data mappers must happen inside the kernel for device eval
- LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
+ LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
+ lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
+ RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
+ rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers
@@ -327,7 +310,6 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
firstHalf++;
} while (firstHalf<numTiles);
-
// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
@@ -364,35 +346,52 @@ template< typename Self, typename OutScalar, typename Index, typename ContractT,
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
- // create a tuple of accessors from Evaluator
+
typedef typename Self::XprType HostExpr;
- // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- // typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
- auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
- typedef decltype(functors) FunctorExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
+ typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
+ typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
+ typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
+ // extract lhs functor list
+ LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+ // extract rhs functor list
+  RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
+
Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);
+
self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
- typedef decltype(tuple_of_accessors) TupleType;
+ /// work-around for gcc bug
+ typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
+ /// work-around for gcc bug
+ typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
+ // create lhs tuple of accessors
+ LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
+ // create rhs tuple of accessors
+ RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
+
// Local memory for elements of Lhs
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
+
+ typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
- auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
- typedef decltype(out_res) OutAccessor;
+ OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
+
// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
+ KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
+ WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
+ m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
}
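The "work-around for gcc bug" comments above refer to naming the tuple type from the call expression itself rather than from an auto local declared first. A minimal self-contained sketch of the two forms (make_accessors is a hypothetical stand-in for createTupleOfAccessors, not Eigen API):

#include <tuple>

// Hypothetical stand-in for Eigen::TensorSycl::internal::createTupleOfAccessors.
template <typename T>
std::tuple<T, T> make_accessors(const T& v) { return std::tuple<T, T>(v, v); }

template <typename T>
void submit_sketch(const T& eval) {
  // Form used before this patch (triggered the gcc bug inside SYCL command
  // groups):
  //   auto tuple_of_accessors = make_accessors(eval);
  //   typedef decltype(tuple_of_accessors) TupleType;
  // Work-around: name the type from the call expression first, then declare
  // the variable with the named type.
  typedef decltype(make_accessors(eval)) TupleType;
  TupleType tuple_of_accessors = make_accessors(eval);
  static_cast<void>(tuple_of_accessors);
}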
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 860a6949a..b29968b63 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the sycl accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
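A short sketch of the conversion path this accessor exposes (illustrative only; assumes EIGEN_USE_SYCL and device-accessible buffers):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void cast_on_device(const Eigen::SyclDevice& sycl_device,
                    int* in, float* out, int n) {
  Eigen::TensorMap<Eigen::Tensor<int, 1>> input(in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> result(out, n);
  // cast<float>() builds a TensorConversionOp node; the impl() accessor added
  // above lets the SYCL accessor extraction reach the wrapped evaluator.
  result.device(sycl_device) = input.cast<float>();
}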
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 930837021..822e22c2d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
-template<typename XprType, template <class> class MakePointer_>
-struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
@@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
enum {
Flags = 0
};
- template <class T> struct MakePointer {
- // Intermediate typedef to workaround MSVC issue.
- typedef MakePointer_<T> MakePointerT;
- typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
- };
};
-template<typename XprType, template <class> class MakePointer_>
-struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
{
- typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
+ typedef const TensorForcedEvalOp<XprType>& type;
};
-template<typename XprType, template <class> class MakePointer_>
-struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
{
- typedef TensorForcedEvalOp<XprType, MakePointer_> type;
+ typedef TensorForcedEvalOp<XprType> type;
};
} // end namespace internal
-template<typename XprType, template <class> class MakePointer_>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
};
-template<typename ArgType, typename Device, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
{
- typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
+ typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
@@ -150,7 +143,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
+ CoeffReturnType* data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
@@ -160,7 +153,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
- typename MakePointer<CoeffReturnType>::Type m_buffer;
+ CoeffReturnType* m_buffer;
};
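For context, TensorForcedEvalOp is the node produced by .eval() on a tensor expression; a minimal host-side sketch (default device, illustrative values):

#include <unsupported/Eigen/CXX11/Tensor>

void forced_eval_sketch() {
  Eigen::Tensor<float, 1> a(8);
  a.setRandom();
  // .eval() inserts a TensorForcedEvalOp, materialising (a * 2.0f) into a
  // temporary buffer before the addition consumes it.
  Eigen::Tensor<float, 1> b = (a * 2.0f).eval() + a;
  static_cast<void>(b);
}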
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 9a012c176..2e638992a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
+template<typename XprType> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 485a082e2..ef1c9c42c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
+#elif defined(__SYCL_DEVICE_ONLY__)
+  return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
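The new branch is the SYCL counterpart of the multiply-high trick for dividing by a runtime-invariant integer. A standalone illustration with constants precomputed for dividing by 3 (the constants are examples, not taken from the patch):

#include <cstdint>

uint32_t divide_by_3(uint32_t n) {
  const uint32_t magic = 0xAAAAAAABu; // precomputed multiplier for d = 3
  const int shift = 1;                // precomputed post-shift for d = 3
  // High 32 bits of the 64-bit product, then the post-shift: equals n / 3.
  uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
  return static_cast<uint32_t>(v >> 32) >> shift;
}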
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d582ccbe1..dbe11c7af 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -730,12 +736,22 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
if(m_strides[i]>0){
+ #ifndef __SYCL_DEVICE_ONLY__
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+ #else
+ startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
+ stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
+ #endif
}else{
- /* implies m_strides[i]<0 by assert */
+ /* implies m_strides[i]<0 by assert */
+ #ifndef __SYCL_DEVICE_ONLY__
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ #else
+ startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
+ stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
+ #endif
}
m_startIndices[i] = startIndicesClamped[i];
}
@@ -796,13 +812,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
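A host-side sketch of the clamping semantics above (illustrative sizes): with a positive stride, start/stop indices are clamped into [0, dim], so an oversized stop simply means "to the end".

#include <unsupported/Eigen/CXX11/Tensor>

void strided_slice_sketch() {
  Eigen::Tensor<float, 1> t(10);
  t.setRandom();
  Eigen::array<Eigen::Index, 1> start = {{2}};
  Eigen::array<Eigen::Index, 1> stop = {{100}}; // clamped down to 10
  Eigen::array<Eigen::Index, 1> strides = {{3}};
  // Selects indices 2, 5 and 8.
  Eigen::Tensor<float, 1> s = t.stridedSlice(start, stop, strides);
  static_cast<void>(s);
}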
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 319417687..82ca71215 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -74,7 +74,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
@@ -108,9 +108,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// Dims dims= self.xprDims();
//Op functor = reducer;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
+      // this is a work-around for a gcc bug
+ typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
// create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- typedef decltype(tuple_of_accessors) TupleType;
+ TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(tmp_global_accessor) OutAccessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
@@ -136,7 +137,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
typename Self::Index range, GRange, tileSize;
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
@@ -147,9 +148,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
/// recursively apply reduction on it in order to reduce the whole.
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
+      // this is a work-around for a gcc bug.
+ typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
// create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
+ Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
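A usage sketch of the reducers touched above (assumes EIGEN_USE_SYCL and device-accessible buffers):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void sum_on_device(const Eigen::SyclDevice& sycl_device,
                   float* in, float* out) {
  Eigen::TensorMap<Eigen::Tensor<float, 2>> input(in, 64, 64);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> result(out);
  // Full reduction: the FullReducer above tiles the input, reduces each tile
  // in a kernel, then reduces the per-tile partial results.
  result.device(sycl_device) = input.sum();
}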
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14e392e36..e430b0826 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the accessor
+ const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
+ /// added for sycl in order to construct the buffer from sycl device
+ ReverseDimensions functor() const { return m_reverse; }
+
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
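A usage sketch of the reverse op these hooks expose to SYCL (assumes EIGEN_USE_SYCL and device-accessible buffers):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void reverse_on_device(const Eigen::SyclDevice& sycl_device,
                       float* in, float* out) {
  Eigen::TensorMap<Eigen::Tensor<float, 2>> input(in, 4, 6);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> result(out, 4, 6);
  // Reverse along dimension 0 only; functor() above hands this flag array to
  // the device-side expression reconstruction.
  Eigen::array<bool, 2> rev = {{true, false}};
  result.device(sycl_device) = input.reverse(rev);
}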
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 6c35bfdb6..93615e5c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -117,11 +117,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
+ : m_impl(op.expression(), device), m_strides(op.strides())
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
+#ifndef __SYCL_DEVICE_ONLY__
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+#else
+ m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+#endif
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -224,6 +228,13 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ /// required by sycl in order to extract the accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ /// required by sycl in order to extract the accessor
+ Strides functor() const { return m_strides; }
+
+
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -250,6 +261,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
+ const Strides m_strides;
};
@@ -286,6 +298,12 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
+ /// required by sycl in order to extract the accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
+ /// required by sycl in order to extract the accessor
+ Strides functor() const { return this->m_strides; }
+
+
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
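And a sketch of the striding op itself (assumes EIGEN_USE_SYCL and device-accessible buffers), showing why the evaluator's constructor computes ceil(dim / stride):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void stride_on_device(const Eigen::SyclDevice& sycl_device,
                      float* in, float* out) {
  Eigen::TensorMap<Eigen::Tensor<float, 2>> input(in, 5, 7);
  // Every 2nd row and every 3rd column: output is ceil(5/2) x ceil(7/3) = 3x3.
  Eigen::TensorMap<Eigen::Tensor<float, 2>> result(out, 3, 3);
  Eigen::array<Eigen::Index, 2> strides = {{2, 3}};
  result.device(sycl_device) = input.stride(strides);
}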
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index 113dd2557..29f362ade 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -97,8 +97,18 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
-KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
+#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
+template <typename Expr>\
+struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
+ typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
+};
+KERNELBROKERCONVERTFORCEDEVAL(const)
+KERNELBROKERCONVERTFORCEDEVAL()
+#undef KERNELBROKERCONVERTFORCEDEVAL
+
+
+
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index df1a732e7..56ba82805 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN
+
+
+
+
+ /// specialisation of the \ref ExprConstructor struct when the node type is
+ /// TensorConversionOp
+ #define CONVERSIONEXPRCONST(CVQual)\
+ template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
+ struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
+ typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
+ typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
+ my_nested_type nestedExpr;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+ : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
+ };
+
+ CONVERSIONEXPRCONST(const)
+ CONVERSIONEXPRCONST()
+ #undef CONVERSIONEXPRCONST
+
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorEvalToOp /// 0 here is the output number in the buffer
#define EVALTO(CVQual)\
@@ -212,10 +234,10 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
- TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
+ typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
+ TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorContractionOp or TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
+struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
+CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
+ static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
+ typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
+ NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::Layout,\
+ typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Index>,\
+ Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
+ Type expr;\
+ template <typename FuncDetector>\
+ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+};
+
+SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTION
+
+
#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index 876fcd45e..e4658eda5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -194,6 +194,23 @@ SYCLREDUCTIONEXTACC(const)
SYCLREDUCTIONEXTACC()
#undef SYCLREDUCTIONEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp or TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
+ struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
+ }\
+};
+
+SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
+
+
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorSlicingOp. This is a special case where there is no OP
#define SYCLSLICEOPEXTACC(CVQual)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 6f9ab57af..e26cbdf6d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{
};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorConversionOp
+#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
+template <typename ArgType1, typename ArgType2, typename Dev>\
+struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
+ FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
+ FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
+ : subExpr(expr.impl()) {}\
+};
+
+SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
+SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
+#undef SYCLEXTRFUNCCONVERSION
+
#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
@@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP
+#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
+struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
+ typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
+ typedef typename Evaluator::Dimensions Dimensions;\
+ const Dimensions m_dimensions;\
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+ FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
+ : m_dimensions(expr.dimensions()) {}\
+};
+
+
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
+SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
+#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
+
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCTSLICEOP(CVQual)\
@@ -253,9 +285,6 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>,
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
};
-// TensorContractionOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
// TensorConcatenationOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index 37fe196ea..0ac51e7bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const)
REDUCTIONLEAFCOUNT()
#undef REDUCTIONLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorContractionOp or TensorConvolutionOp
+#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
+template <typename Indices, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
+ static const size_t Count =1;\
+};
+
+CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
+CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
+#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
+
+
+
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 4419a1780..f6e3b4766 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -169,6 +169,20 @@ SYCLREDUCTION()
/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorContractionOp or TensorConvolutionOp
+#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
+template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
+ typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
+};
+SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
+SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
+#undef SYCLCONTRACTIONCONVOLUTIONPLH
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorSlicingOp
#define SLICEOPEXPR(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index 32930be26..6ce41b0ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -49,19 +49,39 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
+/// std::array does not have TotalSize(). So I have to get the size through template specialisation.
+template<typename Index, typename Dimensions> struct DimensionSize{
+ static Index getDimSize(const Dimensions& dim){
+ return dim.TotalSize();
+
+ }
+};
+#define DIMSIZEMACRO(CVQual)\
+template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
+ static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
+ return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
+ }\
+};
+
+DIMSIZEMACRO(const)
+DIMSIZEMACRO()
+#undef DIMSIZEMACRO
+
+
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr;
+ typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
FunctorExpr functors = internal::extractFunctors(evaluator);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+ typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
+ TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
typename Expr::Index range, GRange, tileSize;
- dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
+ typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
+ dev.parallel_for_setup(total_size, tileSize, range, GRange);
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range