Diffstat (limited to 'unsupported')
22 files changed, 826 insertions, 122 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 2ac6abf69..1b8017349 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -156,7 +156,7 @@ struct TensorContractionEvaluatorBase m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), - m_result(NULL), m_expr_indices(op.indices()) { + m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -564,9 +564,6 @@ struct TensorContractionEvaluatorBase TensorEvaluator<EvalRightArgType, Device> m_rightImpl; const Device& m_device; Scalar* m_result; - /// required for sycl - const Indices m_expr_indices; - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index b170a1a5c..dc16f89e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, + this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, + this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); } // required by sycl to construct the expr on the device. Returns original left_impl const TensorEvaluator<LeftArgType, Device>& left_impl() const { @@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const TensorEvaluator<RightArgType, Device>& right_impl() const { return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); } - // required by sycl to construct the expr on the device - const Indices& indices() const {return this->m_expr_indices;} }; -/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
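The choose(Cond<...>(), ...) calls above are the compile-time operand swap this file relies on: a RowMajor contraction is evaluated as a ColMajor one with the lhs and rhs expressions exchanged, so the kernel only has to handle one layout. A minimal sketch of how those helpers select an operand (simplified from Eigen's internal definitions, with the device-function qualifiers dropped):

    template <bool Condition> struct Cond {};

    // Selects the first argument at compile time when the condition is true...
    template <typename T1, typename T2>
    const T1& choose(Cond<true>, const T1& first, const T2&) { return first; }

    // ...and the second one when it is false; the RowMajor path uses this to
    // hand the evaluators their operands in swapped order.
    template <typename T1, typename T2>
    const T2& choose(Cond<false>, const T1&, const T2& second) { return second; }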
-template<typename Expr> struct TensorEvaluatorContainer; -template<typename Indices, typename LeftArgType, typename RightArgType> -struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{ - typedef Eigen::DefaultDevice Device; - typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType; - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device) - : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device){} -LeftEvaluator m_leftImpl; -RightEvaluator m_rightImpl; -}; - - -template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, +template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN, -int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{ - - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - - FunctorExpr functors; +int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ + typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr; + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr; + LHSFunctorExpr lhs_functors; + RHSFunctorExpr rhs_functors; LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; @@ -206,38 +177,50 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; RightNocontractT m_j_strides, m_right_nocontract_strides; - TupleType 
tuple_of_accessors; + LHSTupleType left_tuple_of_accessors; + RHSTupleType right_tuple_of_accessors; + Device dev; + - KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_) - :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - tuple_of_accessors(tuple_of_accessors_){} + left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice()); - typedef TensorEvaluatorContainer<DevExpr> DevEvaluator; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr; + auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors); + auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors); + typedef decltype(lhs_dev_expr.expr) LeftArgType; + typedef decltype(rhs_dev_expr.expr) RightArgType; + typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - typename DevEvaluator::LeftEvaluator, LeftNocontractT, + LeftEvaluator, LeftNocontractT, ContractT, 1, lhs_inner_dim_contiguous, false, Unaligned, MakeGlobalPointer> LhsMapper; 
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - typename DevEvaluator::RightEvaluator, RightNocontractT, + RightEvaluator, RightNocontractT, ContractT, 1, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), + lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), + rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); // Matmul Kernel // Thread identifiers @@ -327,7 +310,6 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr firstHalf++; } while (firstHalf<numTiles); - // Store the final results in C for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; @@ -364,35 +346,52 @@ template< typename Self, typename OutScalar, typename Index, typename ContractT, static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - // create a tuple of accessors from Evaluator + typedef typename Self::XprType HostExpr; - // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - // typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName; - auto functors = Eigen::TensorSycl::internal::extractFunctors(self); - typedef decltype(functors) FunctorExpr; + typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; + typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr; + typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr; + // extract lhs functor list + LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + // extract rhs functor list + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); + Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); + self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self); - typedef decltype(tuple_of_accessors) TupleType; + /// work-around for gcc bug +
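/// (the work-around consists of naming the tuple type from the call
/// expression itself, typedef decltype(createTupleOfAccessors<...>(cgh, ...)) TupleType,
/// and then declaring the variable with that named type, instead of the
/// auto-then-decltype(variable) form it replaces, which is presumably what
/// the gcc bug referred to here trips over)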
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; + /// work-around for gcc bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType; + // create lhs tuple of accessors + LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl()); + // create rhs tuple of accessors + RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl()); + // Local memory for elements of Lhs typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc; LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); // Local memory for elements of Rhs typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); + + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; //OutScalar memory - auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - typedef decltype(out_res) OutAccessor; + OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); + // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, + KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, - WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors)); + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 860a6949a..b29968b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + protected: template <int LoadMode, bool ActuallyVectorize> struct PacketConv { diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 930837021..822e22c2d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,8 +26,8 @@ namespace Eigen { /// Therefore, by adding the default value, we managed to convert the type and it does not break any /// existing code as its default value is T*. namespace internal { -template<typename XprType, template <class> class MakePointer_> -struct traits<TensorForcedEvalOp<XprType, MakePointer_> > +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> > enum { Flags = 0 }; - template <class T> struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_<T> MakePointerT; - typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - - }; }; -template<typename XprType, template <class> class MakePointer_> -struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense> +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> { - typedef const TensorForcedEvalOp<XprType, MakePointer_>& type; + typedef const TensorForcedEvalOp<XprType>& type; }; -template<typename XprType, template <class> class MakePointer_> -struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type> +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> { - typedef TensorForcedEvalOp<XprType, MakePointer_> type; + typedef TensorForcedEvalOp<XprType> type; }; } // end namespace internal -template<typename XprType, template <class> class MakePointer_> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors> +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; @@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi }; -template<typename ArgType, typename Device, template <class> class MakePointer_> -struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> { - typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType; + typedef TensorForcedEvalOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -150,7 +143,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; } + CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } @@ -160,7 +153,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> TensorEvaluator<ArgType, Device> m_impl; const ArgType 
m_op; const Device& m_device; - typename MakePointer<CoeffReturnType>::Type m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 9a012c176..2e638992a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp; -template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp; +template<typename XprType> class TensorForcedEvalOp; template<typename ExpressionType, typename DeviceType> class TensorDevice; template<typename Derived, typename Device> struct TensorEvaluator; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 485a082e2..ef1c9c42c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); +#elif defined(__SYCL_DEVICE_ONLY__) + return (cl::sycl::mul_hi(static_cast<uint32_t>(magic), static_cast<uint32_t>(n)) >> shift); #else uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); return (static_cast<uint32_t>(v >> 32) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d582ccbe1..dbe11c7af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, { typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; static const int NumDims = internal::array_size<Strides>::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -730,12 +736,22 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ + #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + #else + startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i])); + stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i])); + #endif }else{ - /* implies m_strides[i]<0 by assert */ + /* implies m_strides[i]<0 by assert */ + #ifndef
__SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + #else + startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1)); + stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1)); + #endif } m_startIndices[i] = startIndicesClamped[i]; } @@ -796,13 +812,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, sizeof(Scalar)); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 319417687..82ca71215 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -74,7 +74,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread. size_t inputSize =self.impl().dimensions().TotalSize(); @@ -108,9 +108,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // Dims dims= self.xprDims(); //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is a work-around for a gcc bug + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef decltype(tuple_of_accessors) TupleType; + TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto tmp_global_accessor = temp_global_buffer.
template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); typedef decltype(tmp_global_accessor) OutAccessor; cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), @@ -136,7 +137,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); typename Self::Index range, GRange, tileSize; typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; @@ -147,9 +148,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { /// recursively apply reduction on it in order to reduce the whole. dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is a work-around for a gcc bug. + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc; + Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 14e392e36..e430b0826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; } + /// required by sycl in order to extract the functor + ReverseDimensions functor() const { return m_reverse; } + protected: Dimensions m_dimensions; array<Index, NumDims> m_strides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 6c35bfdb6..93615e5c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -117,11 +117,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { +#ifndef __SYCL_DEVICE_ONLY__ m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); +#else + m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); +#endif }
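// (apparently ceilf is not available to SYCL device code, hence the
// cl::sycl::ceil branch above; both branches compute ceil(dim / stride),
// so e.g. a dimension of 7 with stride 2 yields ceil(7/2) = 4 output
// coefficients, at input indices 0, 2, 4 and 6)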
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -224,6 +228,13 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + /// required by sycl in order to extract the functor + Strides functor() const { return m_strides; } + + + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -250,6 +261,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; + const Strides m_strides; }; @@ -286,6 +298,12 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } + /// required by sycl in order to extract the accessor + const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } + /// required by sycl in order to extract the functor + Strides functor() const { return this->m_strides; } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 113dd2557..29f362ade 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -97,8 +97,18 @@ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) -KERNELBROKERCONVERT(, false, TensorForcedEvalOp) +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp +#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ +template <typename Expr>\ +struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ + typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\ }; +KERNELBROKERCONVERTFORCEDEVAL(const) +KERNELBROKERCONVERTFORCEDEVAL() +#undef KERNELBROKERCONVERTFORCEDEVAL + + + KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index df1a732e7..56ba82805 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual ASSIGN(const) ASSIGN() #undef ASSIGN + + + + + /// specialisation of the \ref ExprConstructor struct when the node type is + /// const TensorConversionOp + #define CONVERSIONEXPRCONST(CVQual)\ + template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename...
Params>\ + struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\ + typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\ + typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\ + my_nested_type nestedExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ + }; + + CONVERSIONEXPRCONST(const) + CONVERSIONEXPRCONST() + #undef CONVERSIONEXPRCONST + /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorEvalToOp /// 0 here is the output number in the buffer #define EVALTO(CVQual)\ @@ -212,10 +234,10 @@ EVALTO() /// TensorForcedEvalOp #define FORCEDEVAL(CVQual)\ template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ -struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\ +struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\ - TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ + typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ + TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ @@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorContractionOp or TensorConvolutionOp +#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename...
Params>\ +struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ +CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ + static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ + typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ + NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::Layout,\ + typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Index>,\ + Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ +}; + +SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTION + + #define SYCLSLICEOPEXPR(CVQual)\ template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 876fcd45e..e4658eda5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -194,6 +194,23 @@ SYCLREDUCTIONEXTACC(const) SYCLREDUCTIONEXTACC() #undef SYCLREDUCTIONEXTACC +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp or TensorConvolutionOp +#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\ +template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\ + struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ + -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\ + return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\ + }\ +}; + +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONEXTACC + + /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp.
This is a special case where there is no OP #define SYCLSLICEOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 6f9ab57af..e26cbdf6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{ }; +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// TensorConversionOp +#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ +template <typename ArgType1, typename ArgType2, typename Dev>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\ + FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\ + : subExpr(expr.impl()) {}\ +}; + +SYCLEXTRFUNCCONVERSION(TensorConversionOp, const) +SYCLEXTRFUNCCONVERSION(TensorConversionOp, ) +#undef SYCLEXTRFUNCCONVERSION + #define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\ template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\ struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\ @@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ +template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ + typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\ + typedef typename Evaluator::Dimensions Dimensions;\ + const Dimensions m_dimensions;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\ + : m_dimensions(expr.dimensions()) {}\ +}; + + +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) +#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP + /// specialisation of the \ref FunctorExtractor struct when the node type is /// const TensorSlicingOp. This is a specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCTSLICEOP(CVQual)\ @@ -253,9 +285,6 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ }; -// TensorContractionOp -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const) -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),) // TensorConcatenationOp SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 37fe196ea..0ac51e7bf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const) REDUCTIONLEAFCOUNT() #undef REDUCTIONLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is TensorContractionOp or TensorConvolutionOp +#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\ +template <typename Indices, typename LhsXprType, typename RhsXprType>\ +struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\ + static const size_t Count =1;\ +}; + +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) +#undef CONTRACTIONCONVOLUTIONLEAFCOUNT + + + /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 4419a1780..f6e3b4766 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -169,6 +169,20 @@ SYCLREDUCTION() /// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorContractionOp or TensorConvolutionOp +#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ +template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\ +struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\ + typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\ }; +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONPLH + + +/// specialisation of the \ref PlaceHolderExpression when the node is /// TensorSlicingOp #define SLICEOPEXPR(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType, size_t N>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 32930be26..6ce41b0ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -49,19 +49,39 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx /// based expression tree; /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. +/// std::array does not have TotalSize.
So I have to get the size through template specialisation. +template<typename Index, typename Dimensions> struct DimensionSize{ + static Index getDimSize(const Dimensions& dim){ + return dim.TotalSize(); + }\ +}; +#define DIMSIZEMACRO(CVQual)\ +template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\ + static inline Index getDimSize(const std::array<Index, NumDims>& dim){\ + return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\ + }\ +}; + +DIMSIZEMACRO(const) +DIMSIZEMACRO() +#undef DIMSIZEMACRO + + template <typename Expr, typename Dev> void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr; FunctorExpr functors = internal::extractFunctors(evaluator); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator - typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType; - TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator); + typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType; + TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator); typename Expr::Index range, GRange, tileSize; - dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange); + typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions())); + dev.parallel_for_setup(total_size, tileSize, range, GRange); cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index daedb671c..cbbd3efb4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,6 +152,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compilers that don't support cxx11.
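The DimensionSize dispatch added to TensorSyclRun.h above can be exercised on its own; the following is a minimal, self-contained sketch of the same pattern (FixedDims is a hypothetical stand-in for an Eigen dimension object that exposes TotalSize(); the real specialisation defers to Eigen::internal::array_prod):

    #include <array>
    #include <cstddef>

    // Hypothetical stand-in for an Eigen dimension object with TotalSize().
    struct FixedDims {
      std::size_t total;
      std::size_t TotalSize() const { return total; }
    };

    // Generic case: the dimension object reports its own total size.
    template <typename Index, typename Dimensions> struct DimensionSize {
      static Index getDimSize(const Dimensions& dim) { return dim.TotalSize(); }
    };

    // std::array has no TotalSize(), so fold the entries with a product;
    // an empty array yields 1, matching array_prod semantics.
    template <typename Index, std::size_t NumDims>
    struct DimensionSize<Index, std::array<Index, NumDims> > {
      static Index getDimSize(const std::array<Index, NumDims>& dim) {
        Index size = 1;
        for (std::size_t i = 0; i < NumDims; ++i) size *= dim[i];
        return size;
      }
    };

    int main() {
      FixedDims d{24};
      std::array<std::size_t, 3> a = {{2, 3, 4}};
      // Both report a total of 24 coefficients.
      return DimensionSize<std::size_t, FixedDims>::getDimSize(d) ==
             DimensionSize<std::size_t, std::array<std::size_t, 3> >::getDimSize(a) ? 0 : 1;
    }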
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index 0221da110..5dacc87f2 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -65,10 +65,71 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template<int DataLayout, typename Device> +void test_TF(const Device& sycl_device) +{ + Eigen::array<long, 2> left_dims = {{2, 3}}; + Eigen::array<long, 2> right_dims = {{3, 1}}; + Eigen::array<long, 2> res_dims = {{2, 1}}; + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + + + Tensor<float, 2, DataLayout, long> t_left(left_dims); + Tensor<float, 2, DataLayout, long> t_right(right_dims); + Tensor<float, 2, DataLayout, long> t_result_gpu(res_dims); + Tensor<float, 2, DataLayout, long> t_result(res_dims); + + t_left.data()[0] = 1.0f; + t_left.data()[1] = 2.0f; + t_left.data()[2] = 3.0f; + t_left.data()[3] = 4.0f; + t_left.data()[4] = 5.0f; + t_left.data()[5] = 6.0f; + + t_right.data()[0] = -1.0f; + t_right.data()[1] = 0.5f; + t_right.data()[2] = 2.0f; + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size()*sizeof(float); + + + float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); + float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); + float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); for (DenseIndex i = 0; i < t_result.size(); i++) { if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { @@ -84,9 +145,10 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); -} +} + template<int DataLayout, typename Device> void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) { @@ -121,9 +183,10 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) 
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f && !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -204,6 +267,9 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s test_sycl_contraction_k<RowMajor>(sycl_device); test_sycl_contraction_sizes<ColMajor>(sycl_device); test_sycl_contraction_sizes<RowMajor>(sycl_device); + test_TF<RowMajor>(sycl_device); + test_TF<ColMajor>(sycl_device); + end = std::chrono::system_clock::now(); std::chrono::duration<double> elapsed_seconds = end-start; std::time_t end_time = std::chrono::system_clock::to_time_t(end); @@ -211,6 +277,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s << "elapsed time: " << elapsed_seconds.count() << "s\n"; } + void test_cxx11_tensor_contract_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorContractionPerDevice(device)); diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp new file mode 100644 index 000000000..73b394c18 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -0,0 +1,221 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
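// What the loops below verify, in one line: reversing along every dimension
// k flagged true in dim_rev maps index i_k to (dim_k - 1 - i_k), so for
// dim_rev = {false, true, true, false} on a 2x3x5x7 tensor the expected
// identity is tensor(i,j,k,l) == reversed_tensor(i, 2-j, 4-k, l).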
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + + +template <typename DataType, int DataLayout> +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { + + int dim1 = 2; + int dim2 = 3; + int dim3 = 5; + int dim4 = 7; + + array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout> tensor(tensorRange); + Tensor<DataType, 4, DataLayout> reversed_tensor(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu(gpu_out_data, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + // Check that the CPU and GPU reverse operations return the same result. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + + + +template <typename DataType, int DataLayout> +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) +{ + int dim1 = 2; + int dim2 = 3; + int dim3 = 5; + int dim4 = 7; + + array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout> tensor(tensorRange); + Tensor<DataType, 4, DataLayout> expected(tensorRange); + Tensor<DataType, 4, DataLayout> result(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data_expected
+  DataType* gpu_out_data_result = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_expected(gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_result(gpu_out_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(), (tensor.dimensions().TotalSize())*sizeof(DataType));
+
+  if (LValue) {
+    out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+  } else {
+    out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+  }
+  sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
+
+  array<int, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<int, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<int, 4> dst_slice_dim = src_slice_dim;
+  array<int, 4> dst_slice_start = src_slice_start;
+
+  for (int i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
+          in_gpu.slice(src_slice_start, src_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(), (result.dimensions().TotalSize())*sizeof(DataType));
+  for (int i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
+          in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_reverse<DataType, RowMajor>(sycl_device);
+  test_simple_reverse<DataType, ColMajor>(sycl_device);
+  test_expr_reverse<DataType, RowMajor>(sycl_device, false);
+  test_expr_reverse<DataType, ColMajor>(sycl_device, false);
+  test_expr_reverse<DataType, RowMajor>(sycl_device, true);
+  test_expr_reverse<DataType, ColMajor>(sycl_device, true);
+}
+
+void test_cxx11_tensor_reverse_sycl() {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
+  }
+}
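The reverse test above exercises both the r-value form (out = in.reverse(...)) and the l-value form (out.reverse(...) = in) of the expression. The host-side semantics it verifies can be reproduced without a device; a minimal sketch (dimensions illustrative, not taken from the patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<float, 4> t(2, 3, 5, 7);
      t.setRandom();
      // Flip axes 1 and 2: element (i, j, k, l) maps to (i, 2-j, 4-k, l).
      Eigen::array<bool, 4> dim_rev = {{false, true, true, false}};
      Eigen::Tensor<float, 4> r = t.reverse(dim_rev);
      assert(t(0, 0, 0, 0) == r(0, 2, 4, 0));
      return 0;
    }
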
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 000000000..2cbb18f1c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensor_dims);
+  Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);
+
+  std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_no_stride.device(sycl_device) = gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  //no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  //Tensor<float, 4, DataLayout> stride;
+  // stride = tensor.stride(strides);
+
+  gpu_stride.device(sycl_device) = gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout, IndexType> no_stride(stride_dims);
+  Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);
+
+  std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+  //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  // Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+  // result.stride(strides) = tensor;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_stride.stride(strides).device(sycl_device) = gpu_tensor;
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+  // Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+  // result2.stride(strides) = tensor.stride(no_strides);
+
+  gpu_no_stride.stride(strides).device(sycl_device) = gpu_tensor.stride(no_strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_striding<float, ColMajor, ptrdiff_t>(sycl_device);
+  test_simple_striding<float, RowMajor, ptrdiff_t>(sycl_device);
+  test_striding_as_lvalue<float, ColMajor, ptrdiff_t>(sycl_device);
+  test_striding_as_lvalue<float, RowMajor, ptrdiff_t>(sycl_device);
+}
+
+void test_cxx11_tensor_striding_sycl() {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorStridingPerDevice(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index d5c0cbaad..5992a306d 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -229,6 +229,36 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
   sycl_device.deallocate(gpu_in3_data);
   sycl_device.deallocate(gpu_out_data);
 }
+template<typename Scalar1, typename Scalar2, int DataLayout>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+  int size = 20;
+  array<int, 1> tensorRange = {{size}};
+  Tensor<Scalar1, 1, DataLayout> in(tensorRange);
+  Tensor<Scalar2, 1, DataLayout> out(tensorRange);
+  Tensor<Scalar2, 1, DataLayout> out_host(tensorRange);
+
+  in = in.random();
+
+  Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+  Scalar2* gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+  TensorMap<Tensor<Scalar1, 1, DataLayout>> gpu_in(gpu_in_data, tensorRange);
+  TensorMap<Tensor<Scalar2, 1, DataLayout>> gpu_out(gpu_out_data, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.size())*sizeof(Scalar1));
+  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+
+  out_host = in.template cast<Scalar2>();
+  for(int i = 0; i < size; i++)
+  {
+    VERIFY_IS_APPROX(out(i), out_host(i));
+  }
+  printf("cast Test Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
   QueueInterface queueInterface(s);
   auto sycl_device = Eigen::SyclDevice(&queueInterface);
@@ -238,6 +268,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
   test_sycl_mem_transfers<DataType, ColMajor>(sycl_device);
   test_sycl_computations<DataType, ColMajor>(sycl_device);
   test_sycl_mem_sync<DataType, ColMajor>(sycl_device);
+  test_sycl_cast<DataType, int, RowMajor>(sycl_device);
+  test_sycl_cast<DataType, int, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_sycl() {
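test_sycl_cast compares the device-side cast against the same cast run on the host, so the host half alone shows the API shape. A minimal CPU-only sketch (sizes illustrative, not taken from the patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<float, 1> in(20);
      in.setRandom();
      // Element-wise conversion, as gpu_in.template cast<Scalar2>() does on device.
      Eigen::Tensor<int, 1> out = in.cast<int>();
      for (int i = 0; i < 20; ++i) assert(out(i) == static_cast<int>(in(i)));
      return 0;
    }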