diff options
author | 2017-04-05 14:28:08 +0000 | |
---|---|---|
committer | 2017-04-05 14:28:08 +0000 | |
commit | 0d08165a7f7a95c35dead32ec2d567e9a4b609b0 (patch) | |
tree | ac7c8a2f553056f3f3da70592fa821cec8aa883d /unsupported/Eigen/CXX11 | |
parent | 4910630c968f9803bef5572bf7f17767c2ef09f1 (diff) | |
parent | 068cc0970890b534d65dbc99e6b5795acbaaa801 (diff) |
Merged in benoitsteiner/opencl (pull request PR-309)
OpenCL improvements
Diffstat (limited to 'unsupported/Eigen/CXX11')
25 files changed, 1209 insertions, 333 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index d06f40cd8..c0f33ba2d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -119,6 +119,12 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { + return m_impl; + } +#endif + protected: TensorEvaluator<ArgType, Device> m_impl; }; @@ -172,7 +178,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, const ReduceOp& reduce_op, - const int return_dim, + const Index return_dim, const Dims& reduce_dims) : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} @@ -187,12 +193,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di const Dims& reduce_dims() const { return m_reduce_dims; } EIGEN_DEVICE_FUNC - int return_dim() const { return m_return_dim; } + Index return_dim() const { return m_return_dim; } protected: typename XprType::Nested m_xpr; const ReduceOp m_reduce_op; - const int m_return_dim; + const Index m_return_dim; const Dims m_reduce_dims; }; @@ -222,7 +228,11 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { + m_return_dim(op.return_dim()) +#ifdef EIGEN_USE_SYCL + ,m_device(device) +#endif + { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast<int>(ColMajor)) { @@ -252,7 +262,16 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; } + #ifndef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + #else // following functions are required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} + const Device& device() const{return m_device;} + #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -288,10 +307,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi protected: TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl; TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; +#ifdef EIGEN_USE_SYCL + const Device& m_device; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h new file mode 100644 index 000000000..8f76b8254 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorArgMaxSycl.h + * \brief: + * TensorArgMaxSycl + * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +namespace Eigen { +namespace internal { + template<typename Dims, typename XprType> + struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense> + { + typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type; + }; + + template<typename Dims, typename XprType> + struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1, + typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type> + { + typedef TensorTupleReducerDeviceOp<Dims, XprType> type; + }; + +template<typename StrideDims, typename XprType> +struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + + +}// end namespace internal +template<typename StrideDims, typename XprType> +class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, + const Index return_dim, + const StrideDims strides, + const Index stride_mod, const Index stride_div) + :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + Index return_dim() const { return m_return_dim; } + + EIGEN_DEVICE_FUNC + const StrideDims& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const Index& stride_mod() const { return m_stride_mod; } + + EIGEN_DEVICE_FUNC + const Index& stride_div() const { return m_stride_div; } + + protected: + typename Eigen::internal::remove_all<typename + XprType::Nested + >::type m_xpr; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + + +// Eval as rvalue +template<typename StrideDims, typename ArgType> +struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice> +{ + typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Index Scalar; + typedef Index CoeffReturnType; + typedef typename XprType::CoeffReturnType TupleType; + typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) + : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), + m_stride_div(op.stride_div()){} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } +typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); } + +protected: + TensorEvaluator<ArgType , SyclKernelDevice> m_impl; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; +} // end namespace Eigen +#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe340820..5b1235826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const array<Index, NumDimensions>, const Derived> argmax() const { array<Index, NumDimensions> in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, NumDimensions>, @@ -632,7 +632,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const array<Index, NumDimensions>, const Derived> argmin() const { array<Index, NumDimensions> in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, NumDimensions>, @@ -643,7 +643,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const TensorTupleReducerOp< internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, 1>, const Derived> - argmax(const int return_dim) const { + argmax(const Index return_dim) const { array<Index, 1> in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -656,7 +656,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const TensorTupleReducerOp< internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, 1>, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array<Index, 1> in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index e87de0c57..5b4c3c5bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -11,7 +11,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /***************************************************************** - * TensorSyclConvertToDeviceExpression.h + * TensorTensorContractionsycl.h * * \brief: * TensorContractionsycl @@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { + if (data) { evalTo(data); return false; } else { @@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; + size_t out_offset; Index roundUpK, M, N, K; ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; @@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS Device dev; - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), + out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), @@ -230,13 +232,13 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS const Index nGroupId = itemID.get_group(1); // Work-group ID localCol const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + LhsScalar privateLhs; + RhsScalar privateRhs[WorkLoadPerThreadN]; + OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] = 0.0f; + privateRes[wLPTM][wLPTN] = static_cast<OutScalar>(0); } } @@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; if(globalCol<N) - out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; + out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN]; } } } @@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo // extract lhs functor list LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); - + ptrdiff_t out_offset = self.device().get_offset(buffer); self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { /// work-around for gcc bug typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; @@ -379,18 +381,17 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor; //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - + OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer); // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, - WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors, + localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 4247c1c4a..2e6021b1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -32,19 +32,20 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize, range_x, range_y; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<2> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -89,19 +90,20 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -156,21 +158,22 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } itemID.barrier(cl::sycl::access::fence_space::local_space); @@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr m_kernel = in_place; m_local_kernel = false; } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); Scalar* local = (Scalar*)m_device.allocate(kernel_sz); typedef TensorEvalToOp<const KernelArgType> EvalTo; EvalTo evalToTmp(local, m_kernelArg); @@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; // extract input functor list InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + ptrdiff_t out_offset = m_device.get_offset(data); m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { @@ -335,8 +339,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr // create input tuple of accessors InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl); - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data); + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data); typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType; KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel); @@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -421,6 +425,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } }); + m_device.asynchronousExec(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..7e4c129bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,6 +140,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result( @@ -295,6 +299,10 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799bb..c5142b7c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,9 +15,22 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +template <typename Scalar, size_t Align = EIGEN_MAX_ALIGN_BYTES, class Allocator = std::allocator<Scalar>> +struct SyclAllocator { + typedef Scalar value_type; + typedef typename std::allocator_traits<Allocator>::pointer pointer; + typedef typename std::allocator_traits<Allocator>::size_type size_type; + + SyclAllocator( ){}; + Scalar* allocate(std::size_t elements) { return static_cast<Scalar*>(aligned_alloc(Align, elements)); } + void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } +}; + namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) + #define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) + template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { public: @@ -40,27 +53,50 @@ namespace Eigen { size_t m_offset; }; +template<typename AccType> struct memsetkernelFunctor{ - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType; AccType m_acc; + const ptrdiff_t buff_offset; const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} void operator()(cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; + if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; } }; +struct memsetCghFunctor{ + cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& m_buf; + const ptrdiff_t& buff_offset; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c)); + } +}; + + //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector<cl::sycl::device>::iterator it =devices.begin(); while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) - auto s= (*it).template get_info<cl::sycl::info::device::vendor>(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs - it=devices.erase(it); + ///FIXME: Currently there is a bug in amd cpu OpenCL + auto name = (*it).template get_info<cl::sycl::info::device::name>(); + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + auto vendor = (*it).template get_info<cl::sycl::info::device::vendor>(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + + if((*it).is_cpu() && vendor.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs + it = devices.erase(it); + //FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue. + }else if((*it).is_gpu() && name.find("intel")!=std::string::npos){ + it = devices.erase(it); } else{ ++it; @@ -69,18 +105,8 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device return devices; } -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; +class QueueInterface { +public: /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s): @@ -116,11 +142,11 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes)); + std::lock_guard<std::mutex> lock(mutex_); + auto buf = cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); buf.set_final_data(nullptr); - std::lock_guard<std::mutex> lock(mutex_); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf)); + buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >(static_cast<const uint8_t*>(ptr),buf)); return static_cast<void*>(ptr); } @@ -138,62 +164,113 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { std::lock_guard<std::mutex> lock(mutex_); buffer_map.clear(); } - - EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device + /// pointer created as a key we find the sycl buffer and get the host accessor with write mode + /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that + /// this buffer is accessed, the data will be copied to the device. + /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark + /// Also, this is faster as it uses the map_allocator instead of memcpy + template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + auto it =find_buffer(dst); + auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0)); + }); + synchronize(); } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - m_queue.wait_and_throw(); - } - return !exception_caught_; + /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl + /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the + /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination + /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data + /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back + /// to the cpu only once per function call. + template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + auto it =find_buffer(src); + auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); } - // destructor - ~QueueInterface() { buffer_map.clear(); } -}; + /// the memcpy function + template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + auto it1 = find_buffer(static_cast<const void*>(src)); + auto it2 = find_buffer(dst); + auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; + auto i= (static_cast<const uint8_t*>(dst)) - it2->first; + offset/=sizeof(Index); + i/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); + }); + synchronize(); + } -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + size_t rng, GRange, tileSize; + parallel_for_setup(n, tileSize, rng, GRange); + auto it1 = find_buffer(static_cast<const void*>(data)); + ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first; + m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); + synchronize(); + } /// Creation of sycl accessor for a buffer. This function first tries to find /// the buffer in the buffer_map. If found it gets the accessor from it, if not, /// the function then adds an entry by creating a sycl buffer for that particular pointer. template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); + return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { + return find_buffer(ptr)->second; + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first); + } + + EIGEN_STRONG_INLINE void synchronize() const { + m_queue.wait_and_throw(); //pass + } + + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled + m_queue.wait_and_throw(); //pass } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()); - auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); + tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()); + auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); } rng = n; @@ -210,7 +287,7 @@ struct SyclDevice { template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); @@ -234,13 +311,11 @@ struct SyclDevice { } } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); @@ -273,6 +348,108 @@ struct SyclDevice { if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); } } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); + } + + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + m_queue.wait_and_throw(); + } + return !exception_caught_; + } + + // destructor + ~QueueInterface() { buffer_map.clear(); } + +private: + /// class members: + bool exception_caught_ = false; + + mutable std::mutex mutex_; + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > > buffer_map; + /// sycl queue + mutable cl::sycl::queue m_queue; + + EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator find_buffer(const void* ptr) const { + std::lock_guard<std::mutex> lock(mutex_); + auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = it->second.get_size(); + if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; + } + } + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); + } +}; + +// Here is a sycl deviuce struct which accept the sycl queue interface +// as an input +struct SyclDevice { + // class member. + QueueInterface* m_queue_stream; + /// QueueInterface is not owned. it is the caller's responsibility to destroy it. + explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} + + // get sycl accessor + template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { + return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { + return m_queue_stream->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); + + } /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -287,78 +464,27 @@ struct SyclDevice { /// the memcpy function template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; - auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + m_queue_stream->memcpy(dst,src,n); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return m_queue_stream->get_offset(ptr); + + } +// memcpyHostToDevice template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); - ::memcpy(host_acc.get_pointer(), src, n); + m_queue_stream->memcpyHostToDevice(dst,src,n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. +/// here is the memcpyDeviceToHost template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); + m_queue_stream->memcpyDeviceToHost(dst,src,n); } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); - synchronize(); + m_queue_stream->memset(data,c,n); } - - struct memsetCghFunctor{ - cl::sycl::buffer<uint8_t, 1>& m_buf; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); - } - }; + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME @@ -367,39 +493,32 @@ struct SyclDevice { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. + // there is no l3 cache on sycl devices. return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); - // return stream_->deviceProperties().multiProcessorCount; + return m_queue_stream->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); - - // return stream_->deviceProperties().maxThreadsPerBlock; + return m_queue_stream->maxSyclThreadsPerBlock(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + return m_queue_stream->maxSyclThreadsPerMultiProcessor(); // return stream_->deviceProperties().maxThreadsPerMultiProcessor; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); - // return stream_->deviceProperties().sharedMemPerBlock; + return m_queue_stream->sharedMemPerBlock(); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass + m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled - sycl_queue().wait_and_throw(); //pass - + m_queue_stream->asynchronousExec(); } // This function checks if the runtime recorded an error for the // underlying stream device. @@ -407,8 +526,10 @@ struct SyclDevice { return m_queue_stream->ok(); } }; - - +// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. +// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator +// inside the kenrel. So we can have two types of eval for host and device. This is required for TensorArgMax operation +struct SyclKernelDevice:DefaultDevice{}; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d6415817b..8516b37b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -193,7 +193,12 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifndef __SYCL_DEVICE_ONLY__ return loadConstant(m_data+index); +#else + CoeffReturnType tmp = m_data[index]; + return tmp; +#endif } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index eb1d4934e..11ae21be9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -98,9 +98,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_generator(op.generator()) +#ifdef EIGEN_USE_SYCL + , m_argImpl(op.expression(), device) +#endif { - TensorEvaluator<ArgType, Device> impl(op.expression(), device); - m_dimensions = impl.dimensions(); + TensorEvaluator<ArgType, Device> argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { m_strides[0] = 1; @@ -155,6 +158,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } +#endif + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array<Index, NumDims>& coords) const { @@ -178,6 +186,9 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> Dimensions m_dimensions; array<Index, NumDims> m_strides; Generator m_generator; +#ifdef EIGEN_USE_SYCL + TensorEvaluator<ArgType, Device> m_argImpl; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed2..e1d5541bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -173,6 +173,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -241,6 +244,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warnig; + m_outputRows=0; //// silence the uninitialised warnig; } } eigen_assert(m_outputRows > 0); @@ -420,7 +425,11 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -501,6 +510,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Scalar m_paddingValue; TensorEvaluator<ArgType, Device> m_impl; + +#ifdef EIGEN_USE_SYCL + const XprType& m_op; +#endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9ee..af6ecf5f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -215,6 +215,11 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } +#endif + protected: Dimensions m_dimensions; array<Index, NumDims> m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b5ef31d55..77c9c6c6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { + #ifndef __SYCL_DEVICE_ONLY__ if (&rhs == this) return *this; + #endif first = rhs.first; second = rhs.second; return *this; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f6..6e8e7885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -100,6 +100,9 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_patch_dims(op.patch_dims()) +#endif { Index num_patches = 1; const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -255,6 +258,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } +#endif + protected: Dimensions m_dimensions; array<Index, NumDims> m_outputStrides; @@ -262,6 +270,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> array<Index, NumDims-1> m_patchStrides; TensorEvaluator<ArgType, Device> m_impl; + +#ifdef EIGEN_USE_SYCL + const PatchDim m_patch_dims; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e9b..7356334e1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -421,7 +421,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, static const bool RunningFullReduction = (NumOutputDims==0); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) +#if defined(EIGEN_USE_SYCL) + , m_xpr_dims(op.dims()) +#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -675,13 +678,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, } EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} +#if defined(EIGEN_USE_SYCL) + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + const Device& device() const { return m_device; } + const Dims& xprDims() const { return m_xpr_dims; } +#endif private: template <int, typename, typename> friend struct internal::GenericDimReducer; @@ -791,7 +793,10 @@ static const bool RunningOnGPU = false; typename MakePointer_<CoeffReturnType>::Type m_result; const Device& m_device; - const Dims& m_xpr_dims; + +#if defined(EIGEN_USE_SYCL) + const Dims m_xpr_dims; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129e2..94899252b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,15 +27,15 @@ namespace internal { template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h); - auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h); + auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h); typedef decltype(aI) InputAccessor; typedef decltype(aOut) OutputAccessor; typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor; @@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ template<typename BufferTOut, typename BufferTIn> -static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), - bufOut, bufI, dev, length, local); + bufOut, out_offset, bufI, dev, length, local); } }; @@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); + ptrdiff_t out_offset = dev.get_offset(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -157,11 +158,12 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); + auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output); + ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4c1..3d6270614 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -32,6 +32,8 @@ struct MakeLocalPointer { namespace Eigen { + template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp; + template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>; namespace TensorSycl { namespace internal { @@ -48,6 +50,13 @@ template<typename T> struct GetType<false, T>{ typedef T Type; }; +template <bool Conds, size_t X , size_t Y > struct ValueCondition { + static const size_t Res =X; +}; +template<size_t X, size_t Y> struct ValueCondition<false, X , Y> { + static const size_t Res =Y; +}; + } } } @@ -80,6 +89,9 @@ template<typename T> struct GetType<false, T>{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +// TensorArgMaxSycl.h +#include "TensorArgMaxSycl.h" + /// this is used for extracting tensor convolution #include "TensorConvolutionSycl.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9c2..d6ac7b91f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -91,27 +91,37 @@ ASSIGNCONVERT(, false) #undef ASSIGNCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is either TensorForcedEvalOp or TensorEvalToOp +/// type is TensorEvalToOp #define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ + +KERNELBROKERCONVERT(const, true, TensorEvalToOp) +KERNELBROKERCONVERT(, false, TensorEvalToOp) +#undef KERNELBROKERCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ template <typename Expr>\ -struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\ +struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\ + typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\ }; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL +// TensorForcedEvalOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT +// TensorLayoutSwapOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) + +//TensorIndexTupleOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define KERNELBROKERCONVERTREDUCTION(CVQual)\ @@ -124,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ +template <typename OP, typename Dim, typename subExpr>\ +struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\ + typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\ +}; + +KERNELBROKERCONVERTTUPLEREDUCTION(const) +KERNELBROKERCONVERTTUPLEREDUCTION() +#undef KERNELBROKERCONVERTTUPLEREDUCTION + +//TensorSlicingOp #define KERNELBROKERCONVERTSLICEOP(CVQual)\ template<typename StartIndices, typename Sizes, typename XprType>\ struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\ @@ -134,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const) KERNELBROKERCONVERTSLICEOP() #undef KERNELBROKERCONVERTSLICEOP - +//TensorStridingSlicingOp #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\ @@ -145,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const) KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ template <DenseIndex DimId, typename Expr>\ @@ -156,7 +177,26 @@ KERNELBROKERCONVERTCHIPPINGOP(const) KERNELBROKERCONVERTCHIPPINGOP() #undef KERNELBROKERCONVERTCHIPPINGOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp +#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ +}; +KERNELBROKERCONVERTIMAGEPATCHOP(const) +KERNELBROKERCONVERTIMAGEPATCHOP() +#undef KERNELBROKERCONVERTIMAGEPATCHOP + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp +#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct ConvertToDeviceExpression<CVQual TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\ + typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ +}; +KERNELBROKERCONVERTVOLUMEPATCHOP(const) +KERNELBROKERCONVERTVOLUMEPATCHOP() +#undef KERNELBROKERCONVERTVOLUMEPATCHOP } // namespace internal } // namespace TensorSycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d2c..24cc23f45 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -65,7 +65,6 @@ CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP @@ -83,6 +82,7 @@ CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Option ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\ : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\ }; + TENSORMAPFIXEDSIZE(const) TENSORMAPFIXEDSIZE() #undef TENSORMAPFIXEDSIZE @@ -189,9 +189,6 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual ASSIGN() #undef ASSIGN - - - /// specialisation of the \ref ExprConstructor struct when the node type is /// const TensorAssignOp #define CONVERSIONEXPRCONST(CVQual)\ @@ -223,7 +220,7 @@ struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQua Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ + : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ }; EVALTO(const) @@ -236,8 +233,12 @@ EVALTO() template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ - TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ + typedef TensorForcedEvalOp<OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ @@ -248,19 +249,32 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL -template <bool Conds, size_t X , size_t Y > struct ValueCondition { - static const size_t Res =X; -}; -template<size_t X, size_t Y> struct ValueCondition<false, X , Y> { - static const size_t Res =Y; +#define TENSORCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ +struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\ +CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\ + typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ }; +TENSORCUSTOMUNARYOP(const) +TENSORCUSTOMUNARYOP() +#undef TENSORCUSTOMUNARYOP + /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp #define SYCLREDUCTIONEXPR(CVQual)\ template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\ CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\ - static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ + static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\ NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ Type expr;\ @@ -273,32 +287,67 @@ SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp +/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP. +#define SYCLTUPLEREDUCTIONEXPR(CVQual)\ +template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ +struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\ +CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\ + static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\ + static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\ +static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\ + typedef CVQual TensorMap<\ + Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\ + Layout,\ + MakeGlobalPointer\ + > XprType;\ + typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\ + static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\ + typedef array<Index, NumDims> StrideDims;\ + typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\ + fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\ + }\ +}; + +SYCLTUPLEREDUCTIONEXPR(const) +SYCLTUPLEREDUCTIONEXPR() +#undef SYCLTUPLEREDUCTIONEXPR /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp +#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\ template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\ struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ - typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ - NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\ - typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\ - Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ + typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\ + static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\ + typedef CVQual TensorMap<\ + Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\ + Eigen::internal::traits<XprTyp>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ }; -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - - - +//TensorContractionOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +//TensorConvolutionOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +//TensorCustomBinaryOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) +#undef SYCLCONTRACTCONVCUSBIOPS + +//TensorSlicingOp #define SYCLSLICEOPEXPR(CVQual)\ template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\ @@ -315,7 +364,7 @@ SYCLSLICEOPEXPR(const) SYCLSLICEOPEXPR() #undef SYCLSLICEOPEXPR - +//TensorStridingSlicingOp #define SYCLSLICESTRIDEOPEXPR(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\ @@ -332,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const) SYCLSLICESTRIDEOPEXPR() #undef SYCLSLICESTRIDEOPEXPR +//TensorReshapingOp and TensorShufflingOp #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ @@ -344,13 +394,15 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\ }; +// TensorReshapingOp SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const) SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, ) - +// TensorShufflingOp SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const) SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, ) #undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST +//TensorPaddingOp #define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\ template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ @@ -363,11 +415,11 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ }; +//TensorPaddingOp SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) #undef SYCLPADDINGOPEXPRCONST - // TensorChippingOp #define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ @@ -385,6 +437,67 @@ SYCLTENSORCHIPPINGOPEXPR(const) SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR +// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value, funcD.m_padding_type, funcD.m_padding_explicit){}\ +}; + +SYCLTENSORIMAGEPATCHOPEXPR(const) +SYCLTENSORIMAGEPATCHOPEXPR() +#undef SYCLTENSORIMAGEPATCHOPEXPR + +// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... > {\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value,\ + funcD.m_padding_type, funcD.m_padding_explicit){\ + }\ +}; + +SYCLTENSORVOLUMEPATCHOPEXPR(const) +SYCLTENSORVOLUMEPATCHOPEXPR() +#undef SYCLTENSORVOLUMEPATCHOPEXPR + +// TensorLayoutSwapOp and TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\ +template<typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ +}; + +//TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp) + +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR /// template deduction for \ref ExprConstructor struct template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43f6..fb95af59e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -147,6 +147,30 @@ SYCLFORCEDEVALEXTACC(const) SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC +//TensorCustomUnaryOp +#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, typename Dev >\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + + +SYCLCUSTOMUNARYOPEXTACC(const) +SYCLCUSTOMUNARYOPEXTACC() +#undef SYCLCUSTOMUNARYOPEXTACC + +//TensorCustomBinaryOp +#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + +SYCLCUSTOMBINARYOPEXTACC(const) +SYCLCUSTOMBINARYOPEXTACC() +#undef SYCLCUSTOMBIBARYOPEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ @@ -161,15 +185,19 @@ SYCLEVALTOEXTACC() #undef SYCLEVALTOEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXTACC(CVQual)\ +#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\ template <typename OP, typename Dim, typename Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\ +struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; +// TensorReductionOp +SYCLREDUCTIONEXTACC(const,TensorReductionOp) +SYCLREDUCTIONEXTACC(,TensorReductionOp) -SYCLREDUCTIONEXTACC(const) -SYCLREDUCTIONEXTACC() +// TensorTupleReducerOp +SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp) +SYCLREDUCTIONEXTACC(,TensorTupleReducerOp) #undef SYCLREDUCTIONEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp @@ -179,14 +207,14 @@ template<typename Indices, typename LhsXprType, typename RhsXprType, typename De static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; - +//TensorContractionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) #undef SYCLCONTRACTIONCONVOLUTIONEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. #define SYCLSLICEOPEXTACC(CVQual)\ @@ -225,6 +253,49 @@ SYCLTENSORCHIPPINGOPEXTACC(const) SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorImagePatchOp. +#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORIMAGEPATCHOPEXTACC(const) +SYCLTENSORIMAGEPATCHOPEXTACC() +#undef SYCLTENSORIMAGEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorVolumePatchOp. +#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORVOLUMEPATCHOPEXTACC(const) +SYCLTENSORVOLUMEPATCHOPEXTACC() +#undef SYCLTENSORVOLUMEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorLayoutSwapOp, TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\ +template<typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +// TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp) + +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC /// template deduction for \ref ExtractAccessor template <typename Evaluator> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index ee020184b..a7905706d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -33,15 +33,17 @@ namespace internal { /// re-instantiate them on the device. /// We have to pass instantiated functors to the device. // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval). -template <typename Evaluator> struct FunctorExtractor{ - typedef typename Evaluator::Dimensions Dimensions; - const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - FunctorExtractor(const Evaluator& expr) - : m_dimensions(expr.dimensions()) {} +#define DEFALTACTION(Evaluator)\ +typedef typename Evaluator::Dimensions Dimensions;\ +const Dimensions m_dimensions;\ +EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ +FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {} +template <typename Evaluator> struct FunctorExtractor{ + DEFALTACTION(Evaluator) }; + /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything ///TensorConversionOp #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ @@ -113,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const) SYCLEXTRFUNCTERNARY() #undef SYCLEXTRFUNCTERNARY + + +//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different +//from the UnaryCategory and it is similar to the general FunctorExtractor. +/// specialisation of TensorCustomOp +#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename ArgType, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; +//TensorCustomUnaryOp +SYCLEXTRFUNCCUSTOMUNARYOP(const) +SYCLEXTRFUNCCUSTOMUNARYOP() +#undef SYCLEXTRFUNCCUSTOMUNARYOP + +//TensorCustomBinaryOp +#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ +template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; +//TensorCustomBinaryOp +SYCLEXTRFUNCCUSTOMBIBARYOP(const) +SYCLEXTRFUNCCUSTOMBIBARYOP() +#undef SYCLEXTRFUNCCUSTOMBIBARYOP + + + /// specialisation of the \ref FunctorExtractor struct when the node type is /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. #define SYCLEXTRFUNCSELECTOP(CVQual)\ @@ -143,19 +175,26 @@ SYCLEXTRFUNCASSIGNOP(const) SYCLEXTRFUNCASSIGNOP() #undef SYCLEXTRFUNCASSIGNOP -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorEvalToOp, This is an specialisation without OP so it has to be separated. -#define SYCLEXTRFUNCEVALTOOP(CVQual)\ -template <typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\ - : rhsExpr(expr.impl()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node types are +/// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated. +#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\ +template <typename Expr, typename Dev>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\ + FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\ + : xprExpr(expr.impl()) {}\ }; - -SYCLEXTRFUNCEVALTOOP(const) -SYCLEXTRFUNCEVALTOOP() -#undef SYCLEXTRFUNCEVALTOOP +//TensorEvalToOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp) +// TensorLayoutSwapOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp) +// TensorIndexTupleOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp) + +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE template<typename Dim, size_t NumOutputDim> struct DimConstr { template<typename InDim> @@ -166,10 +205,10 @@ template<typename Dim> struct DimConstr<Dim, 0> { template<typename InDim> static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));} }; - +//TensorReductionOp #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\ typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ @@ -177,12 +216,39 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgTy FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\ }; - - SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +//TensorTupleReducerOp +#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\ +template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\ + struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\ + typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\ + static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\ + typedef typename Evaluator::StrideDims StrideDims;\ + typedef typename Evaluator::Index Index;\ + typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ + const Dimensions m_dimensions;\ + const Index m_return_dim;\ + const StrideDims m_strides;\ + const Index m_stride_mod;\ + const Index m_stride_div;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\ + FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\ + : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ + m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ +}; + +SYCLEXTRFUNCTUPLEREDUCTIONOP(const) +SYCLEXTRFUNCTUPLEREDUCTIONOP() +#undef SYCLEXTRFUNCTUPLEREDUCTIONOP + +//TensorContractionOp and TensorConvolutionOp #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ @@ -194,9 +260,10 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, Rhs : m_dimensions(expr.dimensions()) {}\ }; - +//TensorContractionOp SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) +//TensorConvolutionOp SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) #undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP @@ -219,6 +286,7 @@ SYCLEXTRFUNCTSLICEOP(const) SYCLEXTRFUNCTSLICEOP() #undef SYCLEXTRFUNCTSLICEOP +//TensorStridingSlicingOp #define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ @@ -237,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const) SYCLEXTRFUNCTSLICESTRIDEOP() #undef SYCLEXTRFUNCTSLICESTRIDEOP -// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory +// Had to separate TensorReshapingOp and TensorShufflingOp. Otherwise it will be mistaken by UnaryCategory #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ template<typename Param, typename XprType, typename Dev>\ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\ @@ -248,9 +316,11 @@ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprTy : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\ }; +//TensorReshapingOp SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), ) +//TensorShufflingOp SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), ) #undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT @@ -293,7 +363,7 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) //TensorChippingOp #define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ template<DenseIndex DimId, typename XprType, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\ FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ const DenseIndex m_dim;\ const DenseIndex m_offset;\ @@ -307,6 +377,84 @@ SYCLEXTRFUNCCHIPPINGOP(const) SYCLEXTRFUNCCHIPPINGOP() #undef SYCLEXTRFUNCCHIPPINGOP +//TensorImagePatchOp +#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\ +typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\ +FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\ +: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\ + m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\ + m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\ + m_padding_value(expr.xpr().padding_value()){}\ +}; + +SYCLEXTRFUNCIMAGEPATCHOP(const) +SYCLEXTRFUNCIMAGEPATCHOP() +#undef SYCLEXTRFUNCIMAGEPATCHOP + +/// TensorVolumePatchOp +#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\ +typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> Self;\ +FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ +const DenseIndex m_patch_planes;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_plane_strides;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_plane_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_plane_inflate_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top_z;\ +const DenseIndex m_padding_bottom_z;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\ +: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\ + m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\ + m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \ + m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\ +}; +SYCLEXTRFUNCVOLUMEPATCHOP(const) +SYCLEXTRFUNCVOLUMEPATCHOP() +#undef SYCLEXTRFUNCVOLUMEPATCHOP + + /// template deduction function for FunctorExtractor template <typename Evaluator> auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 2f7779036..e5b892f2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -21,11 +21,12 @@ namespace internal { template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ OP op; OutputAccessor aOut; + ptrdiff_t out_offset; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -59,7 +60,7 @@ namespace internal { aI[itemID.get_group(0)] = scratch[localid]; if((length<=local) && globalid ==0){ auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; + aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0]; } } } @@ -71,9 +72,9 @@ namespace internal { template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor; + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -84,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast<Index>(itemID.get_global_linear_id()); @@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum; + output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -109,11 +111,11 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> { public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor; typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -124,8 +126,8 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast<Index>(itemID.get_global_linear_id()); @@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; + output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -170,7 +173,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); @@ -217,7 +220,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); auto scale = (rng*red_factor) + remaining; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index a1c112f4d..234580c7c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -93,26 +93,58 @@ SYCLFORCEDEVALLEAFCOUNT(const) SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT +#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\ +template <typename CustomUnaryFunc, typename XprType>\ +struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\ +static const size_t Count =1;\ +}; + +SYCLCUSTOMUNARYOPLEAFCOUNT(const) +SYCLCUSTOMUNARYOPLEAFCOUNT() +#undef SYCLCUSTOMUNARYOPLEAFCOUNT + + +#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\ +struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\ +static const size_t Count =1;\ +}; +SYCLCUSTOMBINARYOPLEAFCOUNT( const) +SYCLCUSTOMBINARYOPLEAFCOUNT() +#undef SYCLCUSTOMBINARYOPLEAFCOUNT + /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLEAFCOUNT(CVQual)\ +#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\ template <typename Expr>\ -struct LeafCount<CVQual TensorEvalToOp<Expr> > {\ - static const size_t Count = 1 + CategoryCount<Expr>::Count;\ +struct LeafCount<CVQual ExprNode<Expr> > {\ + static const size_t Count = Num + CategoryCount<Expr>::Count;\ }; -EVALTOLEAFCOUNT(const) -EVALTOLEAFCOUNT() -#undef EVALTOLEAFCOUNT +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0) + +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0) + +#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp -#define REDUCTIONLEAFCOUNT(CVQual)\ +#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\ template <typename OP, typename Dim, typename Expr>\ -struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\ +struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\ static const size_t Count =1;\ }; -REDUCTIONLEAFCOUNT(const) -REDUCTIONLEAFCOUNT() +// TensorReductionOp +REDUCTIONLEAFCOUNT(const,TensorReductionOp) +REDUCTIONLEAFCOUNT(,TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp) +REDUCTIONLEAFCOUNT(, TensorTupleReducerOp) + #undef REDUCTIONLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp @@ -128,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) #undef CONTRACTIONCONVOLUTIONLEAFCOUNT - - /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType>\ @@ -139,7 +169,6 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT - /// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp #define CHIPPINGOPLEAFCOUNT(CVQual)\ template <DenseIndex DimId, typename XprType>\ @@ -149,7 +178,7 @@ CHIPPINGOPLEAFCOUNT(const) CHIPPINGOPLEAFCOUNT() #undef CHIPPINGOPLEAFCOUNT - +///TensorStridingSlicingOp #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{}; @@ -158,6 +187,24 @@ SLICESTRIDEOPLEAFCOUNT(const) SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT +//TensorImagePatchOp +#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{}; + + +TENSORIMAGEPATCHOPLEAFCOUNT(const) +TENSORIMAGEPATCHOPLEAFCOUNT() +#undef TENSORIMAGEPATCHOPLEAFCOUNT + +// TensorVolumePatchOp +#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{}; + +TENSORVOLUMEPATCHOPLEAFCOUNT(const) +TENSORVOLUMEPATCHOPLEAFCOUNT() +#undef TENSORVOLUMEPATCHOPLEAFCOUNT } /// namespace TensorSycl } /// namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 74566dcee..9d5708fc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -143,17 +143,52 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorForcedEvalOp +#define CUSTOMUNARYOPEVAL(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\ +}; + +CUSTOMUNARYOPEVAL(const) +CUSTOMUNARYOPEVAL() +#undef CUSTOMUNARYOPEVAL + + /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp -#define EVALTO(CVQual)\ +/// TensorForcedEvalOp +#define CUSTOMBINARYOPEVAL(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\ +}; + +CUSTOMBINARYOPEVAL(const) +CUSTOMBINARYOPEVAL() +#undef CUSTOMBINARYOPEVAL + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensoroOp, TensorLayoutSwapOp, and TensorIndexTupleOp +#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\ template <typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\ - typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\ +struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\ + typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\ }; -EVALTO(const) -EVALTO() -#undef EVALTO +// TensorEvalToOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp) +//TensorLayoutSwapOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp) +//TensorIndexTupleOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp) + +#undef EVALTOLAYOUTSWAPINDEXTUPLE /// specialisation of the \ref PlaceHolderExpression when the node is @@ -169,17 +204,24 @@ CHIPPINGOP() #undef CHIPPINGOP /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp -#define SYCLREDUCTION(CVQual)\ +/// TensorReductionOp and TensorTupleReducerOp (Argmax) +#define SYCLREDUCTION(CVQual, ExprNode)\ template <typename OP, typename Dims, typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\ - typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\ +struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\ + typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\ }; -SYCLREDUCTION(const) -SYCLREDUCTION() + +// tensor reduction +SYCLREDUCTION(const, TensorReductionOp) +SYCLREDUCTION(, TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +SYCLREDUCTION(const, TensorTupleReducerOp) +SYCLREDUCTION(, TensorTupleReducerOp) #undef SYCLREDUCTION + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ @@ -218,6 +260,34 @@ SYCLSLICESTRIDEOPPLH() #undef SYCLSLICESTRIDEOPPLH + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ +}; + +SYCLTENSORIMAGEPATCHOP(const) +SYCLTENSORIMAGEPATCHOP() +#undef SYCLTENSORIMAGEPATCHOP + + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\ + typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ +}; + +SYCLTENSORVOLUMEPATCHOP(const) +SYCLTENSORVOLUMEPATCHOP() +#undef SYCLTENSORVOLUMEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template <typename Expr> struct createPlaceHolderExpression { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index cac785540..29c78184d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,7 +25,6 @@ namespace Eigen { namespace TensorSycl { - template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; @@ -38,7 +37,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx void operator()(cl::sycl::nd_item<1> itemID) { typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); if (gId < range) device_evaluator.evalScalar(gId); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 0ca2cac84..b0f970ae6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -183,9 +183,16 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D CoordAccess = false, RawAccess = false }; +#ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -322,6 +329,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D // Fast representations of different variables. m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); @@ -338,7 +346,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -506,6 +513,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif + Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -600,6 +611,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D Scalar m_paddingValue; TensorEvaluator<ArgType, Device> m_impl; + +#ifdef EIGEN_USE_SYCL + XprType m_op; +#endif }; |