Diffstat (limited to 'unsupported/Eigen')
45 files changed, 1963 insertions, 939 deletions
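The change repeated across most of these files replaces the raw Scalar* that tensor evaluators previously passed around (in evalSubExprsIfNeeded, data(), m_result) with a device-abstracted EvaluatorPointerType obtained from StorageMemory<CoeffReturnType, Device>, and gives evaluators a bind(cl::sycl::handler&) hook that recursively attaches SYCL placeholder accessors to a command group before a kernel launches. A standalone sketch of the pointer-type indirection, using invented stand-in names (HostDevice, StorageMemoryLike, and EvaluatorLike are illustrative, not Eigen types):

    #include <cstdio>

    struct HostDevice {};  // stand-in for a CPU-style device such as Eigen::DefaultDevice

    // Stand-in for StorageMemory: selects the pointer type an evaluator traffics in.
    // A SYCL device would specialize this so Type is an accessor-like wrapper
    // rather than a raw pointer, which is the point of the indirection.
    template <typename Scalar, typename Device>
    struct StorageMemoryLike {
      typedef Scalar* Type;  // host-style devices keep the old raw-pointer behaviour
    };

    template <typename Scalar, typename Device>
    struct EvaluatorLike {
      typedef typename StorageMemoryLike<Scalar, Device>::Type EvaluatorPointerType;

      // Mirrors the new signature in this patch: the abstract pointer type
      // replaces Scalar*, so the same evaluator code can compile for SYCL.
      bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return data == nullptr; }
    };

    int main() {
      EvaluatorLike<float, HostDevice> eval;
      std::printf("%d\n", eval.evalSubExprsIfNeeded(nullptr) ? 1 : 0);  // prints 1
      return 0;
    }

On host devices the indirection collapses back to a plain Scalar*, which is why the CPU and GPU paths in this patch are mostly signature changes.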
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 6f7c6d86d..05e7963f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>& type;
+  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
 };
 template<typename XprType>
@@ -82,6 +82,8 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -100,7 +102,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.dimensions();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -118,11 +120,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 #ifdef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
-    return m_impl;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
 #endif
@@ -154,7 +156,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
 };
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -216,6 +218,9 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<TupleType, Device> TupleStorageMem;
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -231,9 +236,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     : m_orig_impl(op.expression(), device),
       m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
       m_return_dim(op.return_dim())
-#ifdef EIGEN_USE_SYCL
-      ,m_device(device)
-#endif
   {
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
@@ -242,15 +244,18 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     } else {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
       m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
-    }
-    m_stride_div = (m_return_dim >= 0) ? m_strides[m_return_dim] : 1;
+    }
+    // If m_return_dim is not a valid index the divisor falls back to 1; otherwise this can crash on Windows.
+    m_stride_div = ((m_return_dim >= 0) &&
+                    (m_return_dim < static_cast<Index>(m_strides.size())))
+                   ? m_strides[m_return_dim] : 1;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -263,16 +268,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
-  #ifndef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-  #else // following functions are required by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
-  const Device& device() const{return m_device;}
-  #endif
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_orig_impl.bind(cgh);
+  }
+#endif
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   TensorOpCost costPerCoeff(bool vectorized) const {
@@ -312,9 +314,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
-#ifdef EIGEN_USE_SYCL
-  const Device& m_device;
-#endif
 };
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 06bf422c5..d6e51bc6c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -97,6 +97,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   static const int NumDims = XprType::NumDims;
@@ -136,7 +138,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_rightImpl.dimensions();
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -154,6 +156,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -199,13 +202,15 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
       m_leftImpl.writeBlock(*block);
     }
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_leftImpl.data(); }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
 private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 0db637405..38c06aba2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -841,7 +841,7 @@ struct TensorBlockView {
   const Scalar* data() const { return m_data; }
 private:
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_block_sizes, m_block_strides;
   const Scalar* m_data;  // Not owned.
   Scalar* m_allocated_data;  // Owned.
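The TensorArgMax.h hunk above bounds-checks m_return_dim before indexing m_strides, since an out-of-range read there can crash (the diff's comment singles out Windows). A self-contained sketch of the guarded divisor selection, with illustrative values (stride_div here is a free function written for this example, not Eigen code):

    #include <array>
    #include <cassert>
    #include <cstddef>

    typedef long Index;

    template <std::size_t NumDims>
    Index stride_div(Index return_dim, const std::array<Index, NumDims>& strides) {
      // Only index the stride array when return_dim is a valid dimension;
      // otherwise fall back to a divisor of 1, exactly as the patch does.
      return (return_dim >= 0 &&
              return_dim < static_cast<Index>(strides.size()))
                 ? strides[static_cast<std::size_t>(return_dim)]
                 : 1;
    }

    int main() {
      const std::array<Index, 3> strides = {{1, 4, 12}};
      assert(stride_div(1, strides) == 4);   // valid dimension: divide by its stride
      assert(stride_div(-1, strides) == 1);  // flattened argmax: no division needed
      assert(stride_div(7, strides) == 1);   // out of range: safe fallback
      return 0;
    }

A negative return_dim means the argmax is taken over the flattened tensor, so no division is needed; the same fallback of 1 now also covers an index past the end of the stride array.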
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index c102a43fb..10bdbc6a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -37,7 +37,7 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType template<typename Broadcast, typename XprType> struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> { - typedef const TensorBroadcastingOp<Broadcast, XprType>& type; + typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type; }; template<typename Broadcast, typename XprType> @@ -105,7 +105,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; bool isCopy, nByOne, oneByN; + public: + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -205,7 +209,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -238,6 +242,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // TODO: attempt to speed this up. 
The integer divisions and modulo are slow EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) { @@ -272,6 +277,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) { @@ -376,6 +382,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> values[0] = m_impl.coeff(inputIndex); return internal::pload1<PacketReturnType>(values); } else { + EIGEN_UNROLL_LOOP for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[endDim]) { values[i] = m_impl.coeff(inputIndex); @@ -410,6 +417,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> return m_impl.template packet<Unaligned>(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { if (inputIndex > m_inputStrides[dim]-1) { inputIndex = 0; @@ -441,6 +449,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> values[0] = m_impl.coeff(inputIndex); return internal::pload1<PacketReturnType>(values); } else { + EIGEN_UNROLL_LOOP for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[dim]) { values[i] = m_impl.coeff(inputIndex); @@ -465,6 +474,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) { @@ -500,6 +510,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[0]) { values[i] = m_impl.coeff(inputIndex+i); @@ -521,6 +532,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq<Broadcast>(i, 1)) { @@ -556,6 +568,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { values[i] = m_impl.coeff(inputIndex+i); @@ -572,6 +585,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost<Index>(); if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost<Index>(); if (internal::index_statically_eq<Broadcast>(i, 1)) { @@ 
-845,12 +859,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } - + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock( const Dimensions& input_block_sizes, @@ -874,9 +893,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); } - protected: - const Device& m_device; - const Broadcast m_broadcast; +protected: + const Device EIGEN_DEVICE_REF m_device; + const typename internal::remove_reference<Broadcast>::type m_broadcast; Dimensions m_dimensions; array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 0b5d4127b..7afaf0f33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -38,7 +38,7 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> template<DenseIndex DimId, typename XprType> struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> { - typedef const TensorChippingOp<DimId, XprType>& type; + typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type; }; template<DenseIndex DimId, typename XprType> @@ -139,7 +139,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -169,7 +170,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -218,7 +219,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -243,6 +244,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < 
PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; @@ -262,6 +264,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } else { // Cross the stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; @@ -349,26 +352,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> m_impl.block(&input_block); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { - CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (IsOuterChipping && result) { return result + m_inputOffset; } else { return NULL; } } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { - return m_dim.actualDim(); +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { - return m_offset; - } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const @@ -399,10 +396,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> DSizes<Index, NumInputDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; const internal::DimensionId<DimId> m_dim; - const Device& m_device; -// required by sycl - const DenseIndex m_offset; - + const Device EIGEN_DEVICE_REF m_device; }; @@ -466,6 +460,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; @@ -484,6 +479,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> // Cross stride boundary. Fallback to slow path. 
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 3863ee8c3..292a1bae1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -119,6 +119,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, @@ -181,7 +183,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); @@ -219,11 +221,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy Index left_index; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { left_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } } else { left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } @@ -235,11 +239,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy Index right_index; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { right_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } } else { right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } @@ -257,6 +263,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); } @@ -279,13 +286,15 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy TensorOpCost(0, 0, compute_cost); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - const Axis& axis() const { return m_axis; } + 
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 4ddcd982e..de7c2248a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -433,6 +433,8 @@ struct TensorContractionEvaluatorBase typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -452,6 +454,9 @@ struct TensorContractionEvaluatorBase static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType; static const int LDims = internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; @@ -653,14 +658,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { evalTo(data); return false; } else { - m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } @@ -934,7 +939,7 @@ struct TensorContractionEvaluatorBase return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) { @@ -1169,9 +1174,9 @@ protected: TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; TensorEvaluator<EvalRightArgType, Device> m_rightImpl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; OutputKernelType m_output_kernel; - Scalar* m_result; + EvaluatorPointerType m_result; bool m_can_use_xsmm; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 1be823fd1..50865d404 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -59,6 +59,13 @@ struct CoeffLoader { return m_tensor.template packet<LoadMode>(index); } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound 
to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + private: const Tensor m_tensor; }; @@ -87,6 +94,13 @@ struct CoeffLoader<Tensor, true, MakePointer_> { { return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index); } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } + #endif private: typedef typename Tensor::Scalar Scalar; @@ -139,6 +153,7 @@ class SimpleTensorContractionMapper { EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; + EIGEN_UNROLL_LOOP for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { const Index idx = nocontract_val / m_ij_strides[i]; linidx += idx * m_nocontract_strides[i]; @@ -155,6 +170,7 @@ class SimpleTensorContractionMapper { Index contract_val = left ? col : row; if(array_size<contract_t>::value > 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { const Index idx = contract_val / m_k_strides[i]; linidx += idx * m_contract_strides[i]; @@ -179,6 +195,7 @@ class SimpleTensorContractionMapper { Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + EIGEN_UNROLL_LOOP for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { const Index idx0 = nocontract_val[0] / m_ij_strides[i]; const Index idx1 = nocontract_val[1] / m_ij_strides[i]; @@ -199,6 +216,7 @@ class SimpleTensorContractionMapper { Index contract_val[2] = {left ? col : row, left ? col : row + distance}; if (array_size<contract_t>::value> 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { const Index idx0 = contract_val[0] / m_k_strides[i]; const Index idx1 = contract_val[1] / m_k_strides[i]; @@ -230,6 +248,13 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? 
m_contract_strides[0] : 1; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const { return m_tensor; } @@ -302,6 +327,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, EIGEN_ALIGN_MAX Scalar data[packet_size]; data[0] = this->m_tensor.coeff(first); + EIGEN_UNROLL_LOOP for (Index k = 1; k < packet_size - 1; k += 2) { const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); data[k] = this->m_tensor.coeff(internal_pair.first); @@ -472,6 +498,13 @@ class TensorContractionSubMapper { return false; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_base_mapper.bind(cgh); + } + #endif + const ParentMapper& base_mapper() const { return m_base_mapper; } Index vert_offset() const { return m_vert_offset; } Index horiz_offset() const { return m_horiz_offset; } @@ -515,6 +548,22 @@ class TensorContractionInputMapper }; +template <typename T> struct TensorContractionInputMapperTrait; + +template<typename Scalar_, typename Index_, int side_, + typename Tensor_, + typename nocontract_t_, typename contract_t_, + int packet_size_, + bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_> +struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, + nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, + inner_dim_reordered_, Alignment_, MakePointer_> > { + + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; + }; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 938fd0f34..e96f31537 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -129,6 +129,7 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> { typedef typename internal::unpacket_traits<TgtPacket>::type TgtType; internal::scalar_cast_op<SrcType, TgtType> converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < TgtPacketSize; ++i) { values[i] = converter(m_impl.coeff(index+i)); } @@ -164,15 +165,15 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT typename XprType::Nested m_xpr; }; -template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { +template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; -template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { +template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, 
EvalPointerType> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -207,6 +208,7 @@ struct PacketConv { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) { internal::scalar_cast_op<SrcType, TargetType> converter; EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = converter(impl.coeff(index+i)); } @@ -267,10 +269,18 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> typedef typename PacketType<SrcType, Device>::type PacketSourceType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; static const bool IsSameType = internal::is_same<TargetType, SrcType>::value; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = true, + PacketAccess = + #ifndef EIGEN_USE_SYCL + true, + #else + TensorEvaluator<ArgType, Device>::PacketAccess & + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + #endif BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, @@ -284,9 +294,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data); + return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() @@ -330,10 +340,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: TensorEvaluator<ArgType, Device> m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 2d0e6599f..25e1e5896 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -303,6 +303,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, @@ -469,7 +471,7 @@ struct TensorEvaluator<const 
TensorConvolutionOp<Indices, InputArgType, KernelAr PacketSize)); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -525,7 +527,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr m_local_kernel = false; } else { size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); typedef TensorEvalToOp<const KernelArgType> EvalTo; EvalTo evalToTmp(local, m_kernelArg); const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value; @@ -548,7 +550,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr KernelArgType m_kernelArg; const Scalar* m_kernel; bool m_local_kernel; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 6ee3827f3..723d2b082 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -36,7 +36,7 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > template<typename CustomUnaryFunc, typename XprType> struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense> { - typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type; + typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type; }; template<typename CustomUnaryFunc, typename XprType> @@ -88,7 +88,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename PointerType<CoeffReturnType, Device>::Type PointerT; + typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -108,20 +110,20 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; } else { - m_result = static_cast<PointerT>( - m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); evalTo(m_result); return true; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { + if (m_result) { m_device.deallocate_temp(m_result); m_result = NULL; } @@ -141,22 +143,25 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC PointerT data() const { return m_result; } + EIGEN_DEVICE_FUNC 
EvaluatorPointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } #endif protected: - EIGEN_DEVICE_FUNC void evalTo(PointerT data) { - TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(data, m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } Dimensions m_dimensions; const ArgType m_op; - const Device& m_device; - PointerT m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; @@ -251,7 +256,10 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - typedef typename PointerType<CoeffReturnType, Device>::Type PointerT; + + typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -271,12 +279,13 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; } else { - m_result = static_cast<PointerT>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))); + m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); evalTo(m_result); return true; } @@ -303,22 +312,25 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC PointerT data() const { return m_result; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } #endif protected: - EIGEN_DEVICE_FUNC void evalTo(PointerT data) { - TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(data, m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } Dimensions m_dimensions; const XprType m_op; - const Device& m_device; - PointerT m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 8cb95f731..46b9d3ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -39,6 +39,10 @@ struct DefaultDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template<typename Type> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { #if !defined(EIGEN_GPU_COMPILE_PHASE) @@ -54,7 +58,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running on the host CPU return l1CacheSize(); #elif defined(EIGEN_HIP_DEVICE_COMPILE) @@ -67,7 +71,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running single threaded on the host CPU return l3CacheSize(); #elif defined(EIGEN_HIP_DEVICE_COMPILE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 83cde6afb..ebf85c072 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -215,6 +215,10 @@ struct GpuDevice { stream_->deallocate(buffer); } + template<typename Type> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e7beb2c82..93efe2f82 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,498 +14,879 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -template<size_t Align> struct CheckAlignStatically { - static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); -}; -template <bool IsAligned, size_t Align> -struct Conditional_Allocate { +#include <unordered_set> - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { - return aligned_alloc(Align, elements); - } -}; -template <size_t Align> -struct Conditional_Allocate<false, Align> { +namespace Eigen { - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return malloc(elements); - } +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info<cl::sycl::info::device::local_mem_type>()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), 
+ local_mem_size( + queue.get_device() + .template get_info<cl::sycl::info::device::local_mem_size>()), + platform_name(queue.get_device() + .get_platform() + .template get_info<cl::sycl::info::platform::name>()), + device_name(queue.get_device() + .template get_info<cl::sycl::info::device::name>()), + device_vendor( + queue.get_device() + .template get_info<cl::sycl::info::device::vendor>()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; }; -template <typename Scalar, size_t Align = EIGEN_MAX_ALIGN_BYTES, class Allocator = std::allocator<Scalar>> -struct SyclAllocator { - typedef Scalar value_type; - typedef typename std::allocator_traits<Allocator>::pointer pointer; - typedef typename std::allocator_traits<Allocator>::size_type size_type; - SyclAllocator( ){}; - Scalar* allocate(std::size_t elements) { - return static_cast<Scalar*>(Conditional_Allocate<CheckAlignStatically<Align>::Val, Align>::conditional_allocate(elements)); +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector<cl::sycl::device> supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info<cl::sycl::info::platform::name>(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } } - void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } -}; - -namespace Eigen { + return supported_devices; +#endif +} -#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast<Scalar*>(static_cast<void*>(((buf_acc.get_pointer().get())))) -#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. 
+ template <typename DeviceOrSelector> + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type<DeviceOrSelector>(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + template <typename DeviceOrSelector> + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} - template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer<buffer_scalar_t, 1> &buf) const { + std::lock_guard<std::mutex> lock(pmapper_mutex_); + return static_cast<void *>(pMapper.add_pointer(buf)); + } - private: - read_accessor m_src_acc; - write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - -template<typename AccType> - struct memsetkernelFunctor{ - AccType m_acc; - const ptrdiff_t buff_offset; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; - } - - }; - -struct memsetCghFunctor{ - cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c)); + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void 
detach_buffer(void *p) const { + std::lock_guard<std::mutex> lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree<false>(p, pMapper); } -}; -//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ -std::vector<cl::sycl::device> supported_devices; -auto plafrom_list =cl::sycl::platform::get_platforms(); -for(const auto& platform : plafrom_list){ - auto device_list = platform.get_devices(); - auto platform_name =platform.template get_info<cl::sycl::info::platform::name>(); - std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); - for(const auto& device : device_list){ - auto vendor = device.template get_info<cl::sycl::info::device::vendor>(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) || - (device.is_gpu() && platform_name.find("intel")!=std::string::npos); - if(!unsuported_condition){ - std::cout << "Platform name "<< platform_name << std::endl; - supported_devices.push_back(device); + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. The device + /// pointer would be deleted by calling deallocate function. + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; } +#endif + std::lock_guard<std::mutex> lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } -} -return supported_devices; -} -class QueueInterface { -public: - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. 
- template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s): -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard<std::mutex> lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } - } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); +#endif + } + template <typename data_t> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data); } -})) + template <typename data_t> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, + data_t> + data) const { + return static_cast<data_t *>(data.get_virtual_pointer()); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard<std::mutex> lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); #endif - {} - - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - std::lock_guard<std::mutex> lock(mutex_); - auto buf = cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); - buf.set_final_data(nullptr); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >(static_cast<const uint8_t*>(ptr),buf)); - return static_cast<void*>(ptr); + } + template <cl::sycl::access::mode AcMd, typename T> + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess<AcMd, T> &p) const { + deallocate_temp(p.get_virtual_pointer()); } /// This is used to deallocate the device pointer.
p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it = buffer_map.find(static_cast<const uint8_t*>(p)); - if (it != buffer_map.end()) { - buffer_map.erase(it); - } + std::lock_guard<std::mutex> lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard<std::mutex> lock(mutex_); - buffer_map.clear(); - } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with write mode - /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark - /// Also, this is faster as it uses the map_allocator instead of memcpy - template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto it =find_buffer(dst); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0)); - }); - synchronize(); - } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it =find_buffer(src); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - - /// the memcpy function - template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = find_buffer(static_cast<const void*>(src)); - auto it2 = find_buffer(dst); - auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; - auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + std::lock_guard<std::mutex> lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } + /// The memcpyHostToDevice is used to copy the data from host to device. + /// The destination pointer could be deleted before the copy happened, which is + /// why a callback function is needed. By default, if none is provided, the + /// function is blocking.
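+ /// A minimal usage sketch (illustrative only; `queue` is assumed to be a + /// live QueueInterface and `gpu_dst` a virtual pointer from allocate()): + /// float host_src[4] = {1.f, 2.f, 3.f, 4.f}; + /// queue.memcpyHostToDevice(gpu_dst, host_src, sizeof(host_src), + /// [] { std::cout << "copy done" << std::endl; }); + /// Passing an empty std::function instead makes the call block.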
+ EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function<void()> callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access> + write_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happened, which is + /// why a callback function is needed. By default, if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function<void()> callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access> + read_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n); + buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor<read_mode>(cgh, src, n); + auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// The memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency.
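+ /// For example (sketch; `queue` as above), zero-filling a scratch buffer + /// before accumulating into it: + /// void* acc = queue.allocate_temp(256); + /// queue.memset(acc, 0, 256); + /// // ... submit kernels that accumulate into acc ... + /// queue.deallocate_temp(acc);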
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = find_buffer(static_cast<const void*>(data)); - ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first; - m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor<write_mode>(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. + template <cl::sycl::access::mode AcMd, typename T> + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T> + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard<std::mutex> lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const<T>::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast<size_t>(typed_offset), + reinterpret_cast<std::intptr_t>(start_ptr)); + } + + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. 
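+ /// This is the overload that the memcpy/memset helpers above rely on: + /// inside their command groups they call, e.g., + /// auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n); + /// to obtain an accessor covering exactly the requested range starting at + /// the virtual pointer's offset.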
+ template <cl::sycl::access::mode AcMd, typename Index> + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard<std::mutex> lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access<AcMd, global_access>( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. - template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); + /// the buffer in the pointer mapper and then creates the accessor from it; + /// the pointer must already have been registered, e.g. through allocate or + /// attach_buffer. + template <cl::sycl::access::mode AcMd> + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard<std::mutex> lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access<AcMd, cl::sycl::access::target::global_buffer>( + cgh); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { - return find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer( + const void *ptr) const { + std::lock_guard<std::mutex> lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first); + std::lock_guard<std::mutex> lock(pmapper_mutex_); + return pMapper.get_offset(ptr); } EIGEN_STRONG_INLINE void synchronize() const { - m_queue.wait_and_throw(); //pass +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif } - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled - m_queue.wait_and_throw(); //pass + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif } - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(m_queue.get_device().
template get_info<cl::sycl::info::device::max_work_group_size>()); - auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); - } + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast<Index>(tileSize)); rng = n; - if (rng==0) rng=static_cast<Index>(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast<Index>(GRange % tileSize); + if (rng == 0) rng = static_cast<Index>(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast<Index>(GRange % tileSize); if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + Index max_workgroup_Size = + static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast<Index>(max_workgroup_Size)); Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast<Index>(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast<Index>(GRange1 % tileSize1); + tileSize1 = + static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast<Index>(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast<Index>(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); } - tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1); + tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1); rng0 = dim0; - if (rng0==0 ) rng0=static_cast<Index>(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast<Index>(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast<Index>(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = 
static_cast<Index>(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = + static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast<Index>(max_workgroup_Size)); Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); - tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast<Index>(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast<Index>(GRange2 % tileSize2); + tileSize2 = + static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3))); + rng2 = dim2; + if (rng2 == 0) rng2 = static_cast<Index>(1); + GRange2 = rng2; + if (tileSize2 > GRange2) + tileSize2 = GRange2; + else if (GRange2 > tileSize2) { + Index xMode = static_cast<Index>(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode); } - pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast<Index>(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast<Index>(GRange1 % tileSize1); + pow_of_2 = static_cast<Index>( + std::log2(static_cast<Index>(max_workgroup_Size / tileSize2))); + tileSize1 = + static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast<Index>(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast<Index>(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); } - tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2)); + tileSize0 = + static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2)); rng0 = dim0; - if (rng0==0 ) rng0=static_cast<Index>(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast<Index>(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast<Index>(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast<Index>(GRange0 % tileSize0); if (xMode != 0) GRange0 +=
static_cast<Index>(tileSize0 - xMode); } } + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); + return m_device_info.max_compute_units; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept + // OpenCL does not have such a concept return 2; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); + return m_device_info.local_mem_size; } - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + // This function returns the nearest power-of-2 work-group size which is <= + // the maximum device work-group size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2: + // if roundUp is true it returns result >= wGSize, + // else it returns result <= wGSize; + // e.g. for wGSize == 48, roundUp == true yields 64 and roundUp == false yields 32. + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } // This function checks if the runtime recorded an error for the // underlying stream device.
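+ // A typical polling sketch (assuming `queue` is a QueueInterface): after + // submitting work, + // if (!queue.ok()) std::cerr << "SYCL device reported an error\n"; + // first drains the queue (when no error has been recorded yet) and then + // reports any asynchronous exception caught by the handler.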
EIGEN_STRONG_INLINE bool ok() const { if (!exception_caught_) { - m_queue.wait_and_throw(); + synchronize(); } return !exception_caught_; } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard<std::mutex> lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard<std::mutex> lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function<void()> &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list l) const { + bool exception_caught = false; + for (const auto &e : l) { + if (e) { + exception_caught = true; +#ifdef EIGEN_EXCEPTIONS + try { + std::rethrow_exception(e); + } catch (const cl::sycl::exception &e) { + std::cerr << e.what() << std::endl; + } +#else + std::cerr << "Error detected inside Sycl device." << std::endl; + abort(); +#endif + } + } + return exception_caught; + } -private: /// class members: bool exception_caught_ = false; - mutable std::mutex mutex_; + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_; +#endif /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer needs to be accessed on the + /// host, we should manually deallocate it.
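+ /// For instance (sketch; `queue` is a QueueInterface): + /// void* p = queue.allocate(n * sizeof(float)); // virtual host-side key + /// // ... use p as the data pointer of Eigen device expressions ... + /// queue.deallocate(p); // releases the underlying SYCL buffer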
+ mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set<void *> scratch_buffers; +#endif /// sycl queue mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif - EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator find_buffer(const void* ptr) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The thread pool is used to wait on events and call callbacks + /// asynchronously. + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. It is the caller's responsibility to destroy + /// it. + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; } }; -// Here is a sycl deviuce struct which accept the sycl queue interface +// Here is a sycl device struct which accepts the sycl queue interface // as an input -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template <cl::sycl::access::mode AcMd, typename T> + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T> + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor<AcMd, T>(ptr); + } // get sycl accessor - template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr); + template <cl::sycl::access::mode AcMd> + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_sycl_buffer(ptr); + EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, + rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template <typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, + tileSize2, rng0, rng1, rng2, GRange0, + GRange1, GRange2); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) 
const { - m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); - - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); - } /// deallocate device memory EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } + queue_stream()->deallocate(p); + } - // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template <cl::sycl::access::mode AcMd, typename T> + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } - /// the memcpy function - template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpy(dst,src,n); + template <typename data_t> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template <typename data_t> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, + data_t> + data) const { + return queue_stream()->get(data); } + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer<buffer_scalar_t, 1> &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return m_queue_stream->get_offset(ptr); + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + /// memcpyHostToDevice + template <typename Index> + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function<void()> callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); } -// memcpyHostToDevice - template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - m_queue_stream->memcpyHostToDevice(dst,src,n); + /// memcpyDeviceToHost + template <typename Index> + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function<void()> callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); } -/// here is the memcpyDeviceToHost - template<typename Index> EIGEN_STRONG_INLINE void 
memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpyDeviceToHost(dst,src,n); + /// The memcpy function + template <typename Index> + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); } - /// Here is the implementation of memset function on sycl. + /// The memset function EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - m_queue_stream->memset(data,c,n); + queue_stream()->memset(data, c, n); } /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and @@ -513,40 +894,64 @@ struct SyclDevice { return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue_stream->getNumSyclMultiProcessors(); + return queue_stream()->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue_stream->maxSyclThreadsPerBlock(); + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept - return m_queue_stream->maxSyclThreadsPerMultiProcessor(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + // OpenCL does not have such a concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue_stream->sharedMemPerBlock(); + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } EIGEN_STRONG_INLINE void synchronize() const { - m_queue_stream->synchronize(); //pass + queue_stream()->synchronize(); } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - m_queue_stream->asynchronousExec(); + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return queue_stream()->get_latest_event(); } + // This function checks if the runtime recorded an error for the // underlying stream device.
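+ // e.g. a minimal session that ends by checking the device state (sketch; + // assumes EIGEN_USE_SYCL and that the queue interface outlives the device): + // QueueInterface qi(cl::sycl::default_selector()); + // SyclDevice dev(&qi); + // float* d = static_cast<float*>(dev.allocate(64 * sizeof(float))); + // TensorMap<Tensor<float, 1>> t(d, 64); + // t.device(dev) = t.constant(1.f); + // dev.synchronize(); + // if (!dev.ok()) std::cerr << "SYCL device reported an error" << std::endl; + // dev.deallocate(d);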
- EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); } }; -// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. -// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator -// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation -struct SyclKernelDevice:DefaultDevice{}; - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index b43db40c8..ef22a268a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -75,6 +75,11 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + + template<typename Type> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { #ifdef __ANDROID__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 910472ad8..60a07d6eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -42,7 +42,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> > // Intermediate typedef to workaround MSVC issue. 
typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; }; @@ -105,7 +104,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - + typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, @@ -124,22 +125,16 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -191,19 +186,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } ArgType expression() const { return m_expression; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_buffer.bind(cgh); + } + #endif - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} private: TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; + EvaluatorPointerType m_buffer; const ArgType m_expression; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 7f0f4acbc..1d48b5eed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -34,6 +34,9 @@ struct TensorEvaluator typedef typename Derived::Dimensions Dimensions; typedef Derived XprType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef typename internal::traits<Derived>::template 
MakePointer<Scalar>::Type TensorPointerType; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? @@ -60,16 +63,17 @@ struct TensorEvaluator TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + : m_data(device.get((const_cast<TensorPointerType>(m.data())))), + m_dims(m.dimensions()), + m_device(device) { } - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; @@ -78,14 +82,12 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits<Derived>::template MakePointer<Scalar>::RefType - coeffRef(Index index) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); return m_data[index]; } @@ -114,7 +116,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -122,10 +124,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits<Derived>::template MakePointer<Scalar>::RefType + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<DenseIndex, NumCoords>& coords) { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -152,16 +153,18 @@ struct TensorEvaluator TensorBlockWriter::Run(block, m_data); } - EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; } - - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& 
m_impl; + const Device m_device; }; namespace { @@ -184,6 +187,13 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template <cl::sycl::access::mode AcMd, typename T> +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) { + return *address; +} +#endif } @@ -197,7 +207,9 @@ struct TensorEvaluator<const Derived, Device> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; typedef const Derived XprType; - + typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType; + typedef StorageMemory<const Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? @@ -221,18 +233,15 @@ struct TensorEvaluator<const Derived, Device> typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> TensorBlockReader; - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; @@ -241,13 +250,8 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); -#ifndef __SYCL_DEVICE_ONLY__ + eigen_assert(m_data != NULL); return loadConstant(m_data+index); -#else - CoeffReturnType tmp = m_data[index]; - return tmp; -#endif } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -269,7 +273,7 @@ struct TensorEvaluator<const Derived, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -288,16 +292,17 @@ struct TensorEvaluator<const Derived, Device> TensorBlockReader::Run(block, m_data); } - EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; } - - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device m_device; }; @@ -310,16 +315,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> { typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; - enum { - IsAligned = true, - PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() @@ -331,10 +326,26 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType<CoeffReturnType, Device>::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -354,13 +365,14 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> PacketType<CoeffReturnType, Device>::size); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif 
private: const NullaryOp m_functor; @@ -401,14 +413,15 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; - + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; static const int NumDims = internal::array_size<Dimensions>::value; typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } @@ -456,16 +469,18 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> arg_block.data()); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif private: - const Device& m_device; + const Device m_device; const UnaryOp m_functor; TensorEvaluator<ArgType, Device> m_argImpl; }; @@ -509,6 +524,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; static const int NumDims = internal::array_size< typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; @@ -524,7 +541,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg return m_leftImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); return true; @@ -576,16 +593,17 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg right_block.block_strides(), right_block.data()); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + #ifdef EIGEN_USE_SYCL + // binding 
placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif private: - const Device& m_device; + const Device m_device; const BinaryOp m_functor; TensorEvaluator<LeftArgType, Device> m_leftImpl; TensorEvaluator<RightArgType, Device> m_rightImpl; @@ -639,6 +657,8 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -646,7 +666,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, return m_arg1Impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_arg1Impl.evalSubExprsIfNeeded(NULL); m_arg2Impl.evalSubExprsIfNeeded(NULL); m_arg3Impl.evalSubExprsIfNeeded(NULL); @@ -679,14 +699,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif private: const TernaryOp m_functor; @@ -731,6 +753,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -738,7 +762,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -757,13 +781,15 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector<PacketSize> select; - for (Index i = 0; i < 
PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet<LoadMode>(index), - m_elseImpl.template packet<LoadMode>(index)); + internal::Selector<PacketSize> select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -773,14 +799,16 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif private: TensorEvaluator<IfArgType, Device> m_condImpl; TensorEvaluator<ThenArgType, Device> m_thenImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 647c98d4e..f1ae548f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -442,12 +442,133 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til // SYCL Executor policy #ifdef EIGEN_USE_SYCL -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, SyclDevice, Vectorizable> { -public: - static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); +template <bool Vectorizable, typename Evaluator> +struct ExecExprFunctorKernel_impl { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast<Index>(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Evaluator> +struct ExecExprFunctorKernel_impl<true, Evaluator> { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), 
vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast<Index>(itemID.get_global_linear_id()); + if (gId < vectorizable_threads) { + const Index PacketSize = Eigen::internal::unpacket_traits< + typename Evaluator::PacketReturnType>::size; + evaluator.evalPacket(gId * PacketSize); + gId += (vectorizable_threads * PacketSize); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += vectorizable_threads) { + evaluator.evalScalar(i); + } + } + } +}; + +template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator> +struct ExecExprFunctorKernel + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value, + Evaluator> { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value, + Evaluator>(range_, vectorizable_threads_, evaluator) {} +}; + +template <typename Expr, typename Evaluator> +struct ExecExprFunctorKernel<Expr, false, Evaluator> + : ExecExprFunctorKernel_impl<false, Evaluator> { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl<false, Evaluator>( + range_, vectorizable_threads_, evaluator) {} +}; + +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { + Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType< + typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType, + Eigen::SyclDevice>::size; + Index vectorizable_threads = + static_cast<Index>(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + auto f = [&](cl::sycl::handler &cgh) { + evaluator.bind(cgh); + typedef ExecExprFunctorKernel<Expression, true, + Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>> + conditional_vectorized_kernel; + + typedef ExecExprFunctorKernel<Expression, false, + Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>> + non_vectorized_kernel; +// Vectorisability is a compile-time property of the expression, whereas the +// size of a tensor is only known at run time; this check makes sure that an +// expression with fewer elements than a packet never dispatches to the +// vectorized kernel. + (vectorizable_threads) + ? 
cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel<conditional_vectorized_kernel>(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + conditional_vectorized_kernel(range, vectorizable_threads, + evaluator)) + : cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel<non_vectorized_kernel>(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + non_vectorized_kernel(range, vectorizable_threads, + evaluator)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); + dev.async_synchronize(e); + } + evaluator.cleanup(); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 4b6540c07..c9bccfc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -89,7 +89,10 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType; + typedef typename TypeConversion<Scalar, + typename XprTraits::PointerType + >::type + PointerType; }; template<typename UnaryOp, typename XprType> @@ -164,9 +167,10 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > static const int Layout = XprTraits::Layout; typedef typename TypeConversion<Scalar, typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, - typename traits<LhsXprType>::PointerType, - typename traits<RhsXprType>::PointerType>::type - >::type PointerType; + typename traits<LhsXprType>::PointerType, + typename traits<RhsXprType>::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -245,9 +249,10 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT static const int Layout = XprTraits::Layout; typedef typename TypeConversion<Scalar, typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val, - typename traits<Arg2XprType>::PointerType, - typename traits<Arg3XprType>::PointerType>::type - >::type PointerType; + typename traits<Arg2XprType>::PointerType, + typename traits<Arg3XprType>::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 480cf1f39..8d1a6d9cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -131,6 +131,8 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D typedef OutputScalar CoeffReturnType; typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -167,13 +169,13 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { 
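// A caller-supplied buffer receives the FFT result directly, and returning false
// tells the caller that no evaluator-owned storage is involved; otherwise a
// temporary is allocated on the device and returning true routes coeff() reads
// through it.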
evalToBuf(data); return false; } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); evalToBuf(m_data); return true; } @@ -202,11 +204,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value; ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); @@ -576,12 +583,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D protected: Index m_size; - const FFT& m_fft; + const FFT EIGEN_DEVICE_REF m_fft; Dimensions m_dimensions; array<Index, NumDims> m_strides; TensorEvaluator<ArgType, Device> m_impl; - CoeffReturnType* m_data; - const Device& m_device; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 186457a31..27a15bfb2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -78,9 +78,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn }; -template<typename ArgType, typename Device> -struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +template<typename ArgType_, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> { + typedef const typename internal::remove_all<ArgType_>::type ArgType; typedef TensorForcedEvalOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; @@ -88,6 +89,9 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -106,8 +110,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> TensorBlockReader; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + : m_impl(op.expression(), device), m_op(op.expression()), + m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return 
m_impl.dimensions(); } @@ -115,18 +119,19 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> #if !defined(EIGEN_HIPCC) EIGEN_DEVICE_FUNC #endif - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + #ifndef EIGEN_USE_SYCL // Should initialize the memory in case we're dealing with non POD types. if (NumTraits<CoeffReturnType>::RequireInitialization) { for (Index i = 0; i < numValues; ++i) { new(m_buffer+i) CoeffReturnType(); } } + #endif typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - + EvalTo evalToTmp(m_device.get(m_buffer), m_op); const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value; const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess && TensorEvaluator<const ArgType, Device>::PreferBlockAccess; @@ -165,17 +170,20 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buffer; } + EvaluatorPointerType data() const { return m_buffer; } - /// required by sycl in order to extract the sycl accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_buffer.bind(cgh); + m_impl.bind(cgh); + } +#endif private: TensorEvaluator<ArgType, Device> m_impl; const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_buffer; + const Device m_device; + EvaluatorPointerType m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 09b7c994b..ff8a19f87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,17 +20,20 @@ namespace Eigen { // map_allocator. template<typename T> struct MakePointer { typedef T* Type; - typedef T& RefType; - typedef T ScalarType; }; -// The PointerType class is a container of the device specefic pointer -// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression +template <typename T> +EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast<T*>(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression // is a device-agnostic type and need MakePointer class for type conversion, -// the TensorEvaluator calls can be specialized for a device, hence it is possible +// the TensorEvaluator class can be specialized for a device, hence it is possible // to construct different types of temporary storage memory in TensorEvaluator -// for different devices by specializing the following PointerType class. 
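A minimal, self-contained sketch of the specialization pattern the comment above describes; ToyDevice, ToyAccessor, and the Toy* traits are illustrative stand-ins, not Eigen types:

struct HostDevice {};
struct ToyDevice {};  // stand-in for a device such as SyclDevice

// Stand-in for a device-side handle such as a SYCL range accessor.
template <typename T> struct ToyAccessor { T* ptr; };

template <typename T> struct ToyMakePointer { typedef T* Type; };

// Default: storage is addressed through a plain pointer.
template <typename T, typename Device>
struct ToyStorageMemory : ToyMakePointer<T> {};

// Device specialization: storage is addressed through the device's handle type.
template <typename T> struct ToyStorageMemory<T, ToyDevice> {
  typedef ToyAccessor<T> Type;
};

// ToyStorageMemory<float, HostDevice>::Type is float*, while
// ToyStorageMemory<float, ToyDevice>::Type is ToyAccessor<float>.

The EIGEN_USE_SYCL block later in this file applies the same pattern, mapping StorageMemory<T, SyclDevice> onto a TensorSycl::internal::RangeAccess handle.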
-template<typename T, typename Device> struct PointerType : MakePointer<T>{}; +// for different devices by specializing the following StorageMemory class. +template<typename T, typename device> struct StorageMemory: MakePointer <T> {}; namespace internal{ template<typename A, typename B> struct Pointer_type_promotion { @@ -39,24 +42,10 @@ template<typename A, typename B> struct Pointer_type_promotion { template<typename A> struct Pointer_type_promotion<A, A> { static const bool val = true; }; -template<typename A, typename B> struct TypeConversion; -#ifndef __SYCL_DEVICE_ONLY__ -template<typename A, typename B> struct TypeConversion{ +template<typename A, typename B> struct TypeConversion { typedef A* type; }; -#endif -} - -#if defined(EIGEN_USE_SYCL) -namespace TensorSycl { -namespace internal{ -template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor; -template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> -class FullReductionKernelFunctor; -} } -#endif - template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap; @@ -113,6 +102,31 @@ struct ThreadPoolDevice; struct GpuDevice; struct SyclDevice; +#ifdef EIGEN_USE_SYCL + +template <typename T> struct MakeSYCLPointer { + typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type; +}; + +template <typename T> +EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& +constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) { + return data; +} + +template <typename T> +struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {}; +template <typename T> +struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {}; + +namespace TensorSycl { +namespace internal{ +template <typename Evaluator, typename Op> class ReductionFunctor; +} +} +#endif + + enum FFTResultType { RealPart = 0, ImagPart = 1, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 51572f9e7..2edc45f1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -421,6 +421,7 @@ class GaussianGenerator { const array<T, NumDims>& std_devs) : m_means(means) { + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; } @@ -428,6 +429,7 @@ class GaussianGenerator { EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const { T tmp = T(0); + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 204a6fd33..b7ad33626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -88,6 +88,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef 
typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), @@ -104,22 +106,21 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), m_generator(op.generator()) -#ifdef EIGEN_USE_SYCL - , m_argImpl(op.expression(), device) -#endif + : m_device(device), m_generator(op.generator()) { TensorEvaluator<ArgType, Device> argImpl(op.expression(), device); m_dimensions = argImpl.dimensions(); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { m_strides[0] = 1; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); @@ -129,7 +130,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -234,11 +235,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> TensorOpCost::MulCost<Scalar>()); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} #endif protected: @@ -261,14 +262,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> } } - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; array<Index, NumDims> m_strides; array<IndexDivisor, NumDims> m_fast_strides; Generator m_generator; -#ifdef EIGEN_USE_SYCL - TensorEvaluator<ArgType, Device> m_argImpl; -#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 965bd8f1e..5ff67bdae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -154,23 +154,6 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT m_padding_left(padding_left), m_padding_right(padding_right), m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} -#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, 
DenseIndex col_inflate_strides, - bool padding_explicit, DenseIndex padding_top, DenseIndex padding_bottom, - DenseIndex padding_left, DenseIndex padding_right, PaddingType padding_type, - Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(padding_explicit), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(padding_type), m_padding_value(padding_value) {} - -#endif EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } @@ -242,6 +225,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -256,15 +241,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> typedef internal::TensorBlock<Scalar, Index, NumDims, Layout> OutputTensorBlock; -#ifdef __SYCL_DEVICE_ONLY__ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) - #else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) - #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) -#ifdef EIGEN_USE_SYCL - , m_op(op) -#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -410,7 +388,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -516,13 +494,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } #ifdef EIGEN_USE_SYCL - // Required by SYCL in order to construct the expression tree on the device - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -693,6 +673,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i 
< PacketSize; ++i) { values[i] = coeff(index+i); } @@ -744,12 +725,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Scalar m_paddingValue; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator<ArgType, Device> m_impl; - #ifdef EIGEN_USE_SYCL - // Required for SYCL in order to construct the expression tree on the device - XprType m_op; - #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 32caccf87..be8f3a734 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -353,6 +353,7 @@ namespace internal { template<typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { Index result = 1; + EIGEN_UNROLL_LOOP for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { result *= sizes[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index e28565009..f8cda6574 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -86,6 +86,8 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, @@ -131,7 +133,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -146,6 +148,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> eigen_assert(index < dimensions().TotalSize()); *inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -160,6 +163,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> *inputIndex += index / m_strides[0]; return true; } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -195,6 +199,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -215,11 +220,13 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return 
NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index b6d445c50..e7fec5d3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -37,7 +37,7 @@ namespace { { #ifdef EIGEN_GPU_COMPILE_PHASE return __clz(val); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; @@ -55,8 +55,8 @@ namespace { { #ifdef EIGEN_GPU_COMPILE_PHASE return __clzll(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast<int>(cl::sycl::clz(val)); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -92,7 +92,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { #if defined(EIGEN_GPU_COMPILE_PHASE) return __umulhi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::mul_hi(a, static_cast<uint32_t>(b)); #else return (static_cast<uint64_t>(a) * b) >> 32; @@ -103,7 +103,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { #if defined(EIGEN_GPU_COMPILE_PHASE) return __umul64hi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::mul_hi(a, static_cast<uint64_t>(b)); #elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); @@ -124,7 +124,7 @@ namespace { template <typename T> struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -205,8 +205,8 @@ class TensorIntDivisor<int32_t, true> { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef EIGEN_GPU_COMPILE_PHASE return (__umulhi(magic, n) >> shift); -#elif defined(__SYCL_DEVICE_ONLY__) - return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift); #else uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); return (static_cast<uint32_t>(v >> 32) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 998757d14..755170a34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -134,13 +134,22 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> } } +#ifdef EIGEN_USE_SYCL + 
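// Layout swap is a pure view and owns no storage of its own, so the SYCL binding
// below simply forwards to the nested evaluator.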
// binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -162,7 +171,9 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -192,7 +203,7 @@ template<typename ArgType, typename Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - + typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index c6ca396a3..3d859f42d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -59,4 +59,39 @@ #define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000)) #endif +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) + #define EIGEN_DEVICE_REF +#else + #define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#if defined(EIGEN_EXCEPTIONS) + #define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + try { X; } \ + catch(const cl::sycl::exception& e) { \ + std::cerr << "SYCL exception at " \ + << __FILE__ << ":" << __LINE__ << std::endl \ + << e.what() << std::endl; \ + std::rethrow_exception(std::current_exception()); \ + } \ + } while (false) +#else + #define EIGEN_SYCL_TRY_CATCH(X) X +#endif + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 28f629080..395cdf9c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -31,8 +31,12 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin 
public: typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self; typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested<Self>::type Nested; - typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + #ifdef EIGEN_USE_SYCL + typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested; + #else + typedef typename Eigen::internal::nested<Self>::type Nested; + #endif + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; typedef typename internal::traits<PlainObjectType>::Index Index; typedef typename internal::traits<PlainObjectType>::Scalar Scalar; typedef typename NumTraits<Scalar>::Real RealScalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 87be090f9..6afc98877 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -85,9 +85,57 @@ struct PacketType<half, GpuDevice> { #endif #if defined(EIGEN_USE_SYCL) -template <typename T> - struct PacketType<T, SyclDevice> { - typedef T type; + +namespace TensorSycl { +namespace internal { + +template <typename Index, Index A, Index B> struct PlusOp { + static constexpr Index Value = A + B; +}; + +template <typename Index, Index A, Index B> struct DivOp { + static constexpr Index Value = A / B; +}; + +template <typename Index, Index start, Index end, Index step, + template <class Indx, Indx...> class StepOp> +struct static_for { + template <typename UnaryOperator> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for<Index, StepOp<Index, start, step>::Value, end, step, + StepOp>::loop(op); + } +}; +template <typename Index, Index end, Index step, + template <class Indx, Indx...> class StepOp> +struct static_for<Index, end, end, step, StepOp> { + template <typename UnaryOperator> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template <typename OutScalar, typename Device, bool Vectorizable> +struct Vectorise { + static const int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template <typename OutScalar, typename Device> +struct Vectorise<OutScalar, Device, true> { + static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size; + typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { + return ((((x) + (y)-1) / (y)) * (y)); +} + +} // namespace internal +} // namespace TensorSycl + +template <> + struct PacketType<half, SyclDevice> { + typedef half type; static const int size = 1; enum { HasAdd = 0, @@ -104,9 +152,59 @@ template <typename T> HasBlend = 0 }; }; -#endif +template <typename Scalar> +struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + HasHalfPacket = 0 + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; + +}; + +template <typename Scalar> +struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ +template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \ +{\ + typedef 
typename internal::packet_traits<Type>::type type;\ + typedef typename internal::packet_traits<Type>::half half;\ +}; +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{}; +template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{}; +#endif +#endif + // Tuple mimics std::pair but works on e.g. nvcc. template <typename U, typename V> struct Tuple { public: @@ -124,7 +222,7 @@ template <typename U, typename V> struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { - #ifndef __SYCL_DEVICE_ONLY__ + #ifndef SYCL_DEVICE_ONLY if (&rhs == this) return *this; #endif first = rhs.first; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 5352c8f7b..8f6e987b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -37,7 +37,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp template<typename NewDimensions, typename XprType> struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> { - typedef const TensorReshapingOp<NewDimensions, XprType>& type; + typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type; }; template<typename NewDimensions, typename XprType> @@ -106,6 +106,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; static const int NumOutputDims = internal::array_size<Dimensions>::value; static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; @@ -168,7 +171,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -326,10 +329,18 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + 
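// Reshape is likewise a view: binding, like data() above, forwards straight to
// the evaluator it wraps.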
m_impl.bind(cgh); + } + #endif protected: TensorEvaluator<ArgType, Device> m_impl; NewDimensions m_dimensions; @@ -404,7 +415,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp template<typename StartIndices, typename Sizes, typename XprType> struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> { - typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type; }; template<typename StartIndices, typename Sizes, typename XprType> @@ -488,7 +499,7 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> { // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. #ifdef EIGEN_USE_SYCL -template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> { +template <typename Index> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice> { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; @@ -508,6 +519,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef Sizes Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -575,7 +589,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data() @@ -599,10 +613,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi // Use memcpy if it's going to be faster than using the regular evaluation. 
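// The trigger is device-specific: starting a device-side copy has a fixed launch
// cost, so the SyclDevice specialization above takes this path only for runs of
// more than 4*1024*1024 contiguous coefficients.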
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device); if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); } return false; } @@ -637,6 +651,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -648,6 +663,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -667,6 +683,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -698,8 +715,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi m_impl.block(&input_block); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { - Scalar* result = const_cast<Scalar*>(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (result) { Index offset = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -733,19 +750,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{ - return m_impl; - } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{ - return m_offsets; +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } +#endif + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -753,6 +770,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } inputIndex += (index + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -767,7 +785,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, 
ArgType>, Devi array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; @@ -829,6 +847,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -840,6 +859,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -859,6 +879,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> internal::pstore<CoeffReturnType, PacketReturnType>(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } @@ -892,7 +913,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense> { - typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type; + typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type; }; template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> @@ -969,6 +990,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; typedef Strides Dimensions; enum { @@ -985,8 +1008,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), - m_strides(op.strides()), m_exprStartIndices(op.startIndices()), - m_exprStopIndices(op.stopIndices()) + m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped; @@ -1069,7 +1091,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -1091,30 +1113,28 @@ struct TensorEvaluator<const 
TensorStridingSlicingOp<StartIndices, StopIndices, return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } - - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;} - +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -1125,7 +1145,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { -#ifndef __SYCL_DEVICE_ONLY__ +#ifndef SYCL_DEVICE_ONLY return numext::maxi(min, numext::mini(max,value)); #else return cl::sycl::clamp(value, min, max); @@ -1137,15 +1157,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, array<Index, NumDims> m_inputStrides; bool m_is_identity; TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; DSizes<Index, NumDims> m_startIndices; // clamped startIndices DSizes<Index, NumDims> m_dimensions; DSizes<Index, NumDims> m_offsets; // offset in a flattened shape const Strides m_strides; - //use by sycl - const StartIndices m_exprStartIndices; - //use by sycl - const StopIndices m_exprStopIndices; }; // Eval as lvalue diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 4837f2200..e98382cc1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -92,6 +92,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -138,7 +140,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -151,6 +153,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (isPaddingAtIndexForDim(idx, i)) { @@ -164,6 +167,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } inputIndex += (index - m_padding[0].first); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (isPaddingAtIndexForDim(idx, i)) { @@ -192,23 +196,25 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); } else { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); } return cost; } - EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif private: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( @@ -272,6 +278,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index initialIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index firstIdx = index; const Index lastIdx = index + PacketSize - 1; @@ -329,7 +336,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index initialIndex = index; Index inputIndex = 0; - + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index firstIdx = index; const Index lastIdx = index + PacketSize - 1; @@ -383,6 +390,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 4292fe0c2..47db839db 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -89,6 +89,8 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, 
Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { @@ -103,9 +105,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) -#ifdef EIGEN_USE_SYCL - , m_patch_dims(op.patch_dims()) -#endif { Index num_patches = 1; const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -149,7 +148,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -167,6 +166,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -175,6 +175,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -202,6 +203,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> Index inputIndices[2] = {0, 0}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -217,6 +219,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -243,6 +246,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -259,11 +263,13 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } + // binding placeholder accessors to a command group handler for SYCL + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif protected: @@ -274,9 +280,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> TensorEvaluator<ArgType, Device> m_impl; -#ifdef EIGEN_USE_SYCL - const PatchDim m_patch_dims; -#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 787cbd031..2be4f9cc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -44,6 +45,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; + #else // Augment the current time with pseudo random number generation // to ensure that we get different seeds if we try to generate seeds @@ -147,14 +149,41 @@ template <typename T> class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build the PCG_XSH_RS_state in one step. + // Therefore, we need two steps to initialize m_state. + // In SYCL, the constructor of the functor is called on the CPU, + // and that is where we get the clock seed. However, this seed is + // the same for all threads: unlike in CUDA, the thread id, block id, etc. are not global functions + // and are only available inside operator() (which is called on the GPU). + // Thus for CUDA, (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, + // whereas for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, and each thread adds + // (global_thread_id * 6364136223846793005ULL) for itself, only once, to complete the construction. + // The thread id injection is therefore not available at this stage; + // however, when operator() is called the thread id is available. So inside operator() + // we add the thread id, block id, ... (which is the equivalent of i) + // to the seed and construct the unique m_state per thread, just as on CUDA.
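The two-stage construction described in the comment above can be summarised in a small standalone sketch. The class name and the simplified output step are illustrative assumptions; only the seed arithmetic mirrors the patch:

#include <cstdint>

// Sketch of the two-stage seeding; not the actual Eigen generator.
struct TwoStageGenerator {
  // Stage 1 (host): fold the CPU clock seed into the state. The thread id
  // is not known yet, so the construction stays incomplete.
  explicit TwoStageGenerator(std::uint64_t clock_seed)
      : m_state(clock_seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL),
        m_exec_once(false) {}

  // Stage 2 (device): the first call of each thread mixes in its linear
  // index i exactly once, completing the per-thread state.
  std::uint64_t operator()(std::uint64_t i) {
    if (!m_exec_once) {
      m_state += i * 6364136223846793005ULL;
      m_exec_once = true;
    }
    m_state = m_state * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; // advance
    return m_state ^ (m_state >> 22); // simplified output step (assumption)
  }

  std::uint64_t m_state;
  bool m_exec_once;
};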
+ m_exec_once = false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; + #ifdef EIGEN_USE_SYCL + m_exec_once = other.m_exec_once; + #endif } template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage: the thread id is added to the CPU clock seed to build a unique seed per thread. + // The (i * 6364136223846793005ULL) term is the remaining part of the PCG_XSH_RS_state on the GPU side. + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } + #endif T result = RandomToTypeUniform<T>(&m_state, i); return result; } @@ -163,6 +192,14 @@ template <typename T> class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage: the thread id is added to the CPU clock seed to build a unique seed per thread. + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { values[j] = RandomToTypeUniform<T>(&m_state, i); } @@ -171,6 +208,9 @@ template <typename T> class UniformRandomGenerator { private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; template <typename Scalar> @@ -222,14 +262,37 @@ template <typename T> class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build the PCG_XSH_RS_state in one step. + // Therefore, we need two steps to initialize m_state. + // In SYCL, the constructor of the functor is called on the CPU, + // and that is where we get the clock seed. However, this seed is + // the same for all threads: unlike in CUDA, the thread id, block id, etc. are not global functions + // and are only available inside operator() (which is called on the GPU). + // Therefore, the thread id injection is not available at this stage; however, when operator() + // is called the thread id is available. So inside operator() + // we add the thread id, block id, ... (which is the equivalent of i) + // to the seed and construct the unique m_state per thread, just as on CUDA.
+ m_exec_once = false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once = other.m_exec_once; +#endif } template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage: the thread id is added to the CPU clock seed to build a unique seed per thread. + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } + #endif T result = RandomToTypeNormal<T>(&m_state, i); return result; } @@ -238,6 +301,14 @@ template <typename T> class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage: the thread id is added to the CPU clock seed to build a unique seed per thread. + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { values[j] = RandomToTypeNormal<T>(&m_state, i); } @@ -246,6 +317,9 @@ template <typename T> class NormalRandomGenerator { private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index bb63433fe..5dddfcf85 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -299,7 +299,7 @@ template <typename Self, typename Op, typename Device, bool Vectorizable = (Self struct FullReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); } @@ -400,6 +400,18 @@ struct OuterReducer { } }; +#ifdef EIGEN_USE_SYCL +// Default generic reducer +template <typename Self, typename Op, typename Device> +struct GenericReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) template <int B, int N, typename S, typename R, typename I_> @@ -423,6 +435,23 @@ template <int NPT, typename S, typename R, typename I_> __global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input + * with the accumulator, and the other for reducing two accumulators. + * Such a reducer can be useful, for instance, when the accumulator is a boolean or a bitset that checks for + * some properties of the input.
+ */ +template <typename Op, typename CoeffReturnType> +struct ReductionReturnType { +#if EIGEN_HAS_CXX11 && defined(EIGEN_USE_SYCL) + typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type; +#else + typedef typename remove_const<CoeffReturnType>::type type; +#endif +}; + template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)> @@ -520,12 +549,15 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, const Op m_reducer; }; +template<typename ArgType, typename Device> +struct TensorReductionEvaluatorBase; // Eval as rvalue template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> -struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> +struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> { typedef internal::reducer_traits<Op, Device> ReducerTraits; + typedef Dims ReducedDims; typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -535,12 +567,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, static const int NumOutputDims = NumInputDims - NumReducedDims; typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions; typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self; + typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self; static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const Index PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. 
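To make the contract above concrete, here is a hypothetical reducer whose accumulator type (bool) differs from its float input type; on the SYCL path, ReductionReturnType would deduce bool from initialize(). The functor and its name are illustrative, not part of the patch:

// Hypothetical reducer: folds float inputs into a bool accumulator that
// records whether every element was equal to itself (i.e. not NaN).
struct AllNotNaNReducer {
  bool initialize() const { return true; }
  // Reduce one input element into the accumulator.
  void reduce(const float t, bool* accum) const { *accum = *accum && (t == t); }
  // Reduce two accumulators, used when partial results are merged.
  void reduce(const bool t, bool* accum) const { *accum = *accum && t; }
  bool finalize(const bool accum) const { return accum; }
};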
+ static const int NumPreservedStrides = max_n_1<NumOutputDims>::size; + enum { IsAligned = false, PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, @@ -562,11 +602,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) -#if defined(EIGEN_USE_SYCL) - , m_xpr_dims(op.dims()) -#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -653,7 +690,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, // of which will eventually result in an NVCC error EIGEN_DEVICE_FUNC #endif - bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) { + bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -663,7 +700,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, !RunningOnGPU))) { bool need_assign = false; if (!data) { - m_result = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType))); + m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); data = m_result; need_assign = true; } @@ -671,20 +708,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data); return need_assign; } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } // Attempt to use an optimized reduction. 
- else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -698,8 +724,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { + data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -707,6 +733,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, } } Op reducer(m_reducer); + // For SYCL this branch always returns false if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { m_device.deallocate_temp(m_result); @@ -731,8 +758,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { + data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -740,6 +767,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, } } Op reducer(m_reducer); + // For SYCL this branch always returns false if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { m_device.deallocate_temp(m_result); @@ -750,6 +778,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, return (m_result != NULL); } } + #if defined(EIGEN_USE_SYCL) + // If there is no optimised version for SYCL, the reduction expression + // must be broken into two subexpressions and the generic SYCL reducer used on the device.
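For context, any reduction evaluated on a SYCL device flows through this dispatch. A minimal usage sketch follows; the buffer and device setup are assumptions following the usual Eigen SYCL pattern, not part of this patch:

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Sum a 64x32 tensor over its first dimension on a SYCL device. The
// evaluator above picks InnerReducer/OuterReducer when it can and falls
// back to GenericReducer otherwise.
void sum_rows(const Eigen::SyclDevice& sycl_device, float* gpu_in, float* gpu_out) {
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, 64, 32);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, 32);
  Eigen::array<int, 1> reduce_dims;
  reduce_dims[0] = 0;
  out.device(sycl_device) = in.sum(reduce_dims);
}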
+ if(RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + #endif } return true; } @@ -764,7 +807,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { + if ((RunningFullReduction || RunningOnGPU) && m_result) { return *(m_result + index); } Op reducer(m_reducer); @@ -1097,12 +1140,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, m_device.deallocate(reducers); } - EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; } - -#if defined(EIGEN_USE_SYCL) - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - const Device& device() const { return m_device; } - const Dims& xprDims() const { return m_xpr_dims; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_result.bind(cgh); + } #endif private: @@ -1126,8 +1172,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, #endif #if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor; + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor; + // SYCL needs the generic reducer when the reduction algorithm is neither the inner, outer, nor full reducer + template <typename, typename, typename> friend struct internal::GenericReducer; #endif @@ -1255,9 +1302,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, // Precomputed strides for the output tensor. array<Index, NumOutputDims> m_outputStrides; array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1<NumOutputDims>::size; array<Index, NumPreservedStrides> m_preservedStrides; // Map from output to input dimension index.
array<Index, NumOutputDims> m_output_to_input_dim_map; @@ -1288,13 +1332,36 @@ static const bool RunningOnGPU = false; static const bool RunningOnGPU = false; static const bool RunningOnSycl = false; #endif - typename MakePointer_<CoeffReturnType>::Type m_result; + EvaluatorPointerType m_result; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; +}; -#if defined(EIGEN_USE_SYCL) - const Dims m_xpr_dims; -#endif +template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> +: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> { + typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} +}; + + +template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> +: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> { + + typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + // The coeff function in the base class is a recursive method that is not standard layout and cannot be used inside a SYCL kernel. + // Therefore the coeff function has to be overridden for the SYCL kernel. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + index); + } + // The packet function in the base class is a recursive method that is not standard layout and cannot be used inside a SYCL kernel. + // Therefore the packet function has to be overridden for the SYCL kernel. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload<typename Base::PacketReturnType>(this->data() + index); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6e15e75f9..b92c9ffaf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t public: // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions; typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator<Expr, Device> EvalType; TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { m_dims = m_impl.dimensions(); @@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension public: typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; typedef typename Base::Scalar Scalar; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) :
Base(expr, device) { } @@ -362,6 +367,8 @@ struct TensorEvaluator<const TensorRef<Derived>, Device> typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -379,7 +386,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } @@ -394,7 +401,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device> } EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); } - + protected: TensorRef<Derived> m_ref; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 33af7d995..64fed6de7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -156,7 +158,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -169,6 +171,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { Index idx = index / m_fastStrides[i]; index -= idx * m_strides[i]; @@ -183,6 +186,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device inputIndex += index; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { Index idx = index / m_fastStrides[i]; index -= idx * m_strides[i]; @@ -216,6 +220,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device // local structure. 
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -361,12 +366,14 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from sycl device - ReverseDimensions functor() const { return m_reverse; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; @@ -407,7 +414,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } @@ -423,11 +430,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> // This code is pilfered from TensorMorphing.h EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 64f10d0a4..44156126d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -86,12 +86,15 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { typedef TensorScanOp<Op, ArgType> XprType; typedef typename XprType::Index Index; + typedef const ArgType ChildType; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -110,7 +113,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { m_exclusive(op.exclusive()), m_accumulator(op.accumulator()), m_size(m_impl.dimensions()[op.axis()]), - m_stride(1), + m_stride(1), m_consume_dim(op.axis()), m_output(NULL) { // Accumulating a scalar isn't supported. 
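A note on EIGEN_UNROLL_LOOP, which this patch adds in front of small fixed-trip-count loops such as the one above: it is purely an unrolling hint for the compiler. A rough sketch of the idea follows; the real, compiler-dependent definition lives in Eigen/src/Core/util/Macros.h and this form is an assumption:

#if defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)
#define EIGEN_UNROLL_LOOP _Pragma("unroll")
#else
#define EIGEN_UNROLL_LOOP  // expands to nothing where the pragma is unsupported
#endif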
@@ -142,6 +145,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return m_stride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { + return m_consume_dim; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; } @@ -162,7 +169,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return m_device; } - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); ScanLauncher<Self, Op, Device> launcher; if (data) { @@ -171,7 +178,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { } const Index total_size = internal::array_prod(dimensions()); - m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar))); + m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar)))); launcher(*this, m_output); return true; } @@ -181,7 +188,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; } @@ -196,21 +203,29 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_output != NULL) { - m_device.deallocate(m_output); + if (m_output) { + m_device.deallocate_temp(m_output); m_output = NULL; } m_impl.cleanup(); } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_output.bind(cgh); + } +#endif protected: TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; const bool m_exclusive; Op m_accumulator; const Index m_size; Index m_stride; - CoeffReturnType* m_output; + Index m_consume_dim; + EvaluatorPointerType m_output; }; // CPU implementation of scan diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index b577d4d36..ad6332179 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -130,8 +132,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), - m_impl(op.expression(), device), - m_shuffle(op.shufflePermutation()) + m_impl(op.expression(), device) { const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -172,7 +173,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -194,6 +195,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = self.coeff(index + i); } @@ -210,6 +212,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> return self.m_impl.template packet<LoadMode>(index); } else { EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = self.coeff(index + i); } @@ -337,13 +340,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - - // required by sycl - EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} - // required by sycl - EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;} + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( Index input_index, @@ -397,10 +401,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> array<Index, NumDims> m_inputStrides; array<Index, NumDims> m_unshuffledInputStrides; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator<ArgType, Device> m_impl; - /// required by sycl - Shuffle m_shuffle; }; @@ -452,6 +454,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 221dc96c9..3b1cbaabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -37,7 +37,7 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> template<typename Strides, typename XprType> struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> { - typedef const TensorStridingOp<Strides, XprType>& type; + typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type; }; template<typename Strides, typename XprType> @@ -108,6 +108,8 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef 
StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, @@ -120,7 +122,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) + : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { @@ -149,9 +151,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -173,6 +176,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -184,6 +188,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * m_inputStrides[0]; inputIndices[1] += indices[1] * m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -203,6 +208,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -225,18 +231,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - Strides functor() const { return m_strides; } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -244,6 +252,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } inputIndex += index * m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -258,7 +267,6 @@ struct 
TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; - const Strides m_strides; }; // Eval as lvalue @@ -296,11 +304,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -310,6 +313,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; @@ -321,6 +325,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * this->m_inputStrides[0]; inputIndices[1] += indices[1] * this->m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; @@ -340,6 +345,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> internal::pstore<Scalar, PacketReturnType>(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { this->coeffRef(index+i) = values[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 9fc54a4c0..d04b1bea7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -91,6 +91,8 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -205,7 +207,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -249,6 +251,13 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> return result; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: // Given the output index, finds the first index in the input tensor used to compute the trace EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -276,7 +285,7 @@ struct 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index 9fc54a4c0..d04b1bea7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -91,6 +91,8 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
@@ -205,7 +207,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
     return m_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -249,6 +251,13 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
     return result;
   }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
 protected:
   // Given the output index, finds the first index in the input tensor used to compute the trace
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -276,7 +285,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
   // Initialize the size of the trace dimension
   Index m_traceDim;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   array<bool, NumInputDims> m_reduced;
   array<Index, NumReducedDims> m_reducedDims;
   array<Index, NumOutputDims> m_outputStrides;
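The recurring pattern in these hunks is the switch from a raw Scalar* (or traits<XprType>::PointerType) to an EvaluatorPointerType obtained through StorageMemory<CoeffReturnType, Device>. The indirection exists because a SYCL kernel cannot capture raw device pointers; the "pointer" type must therefore be selected per device. A rough standalone illustration of the idea, with invented names (CpuDeviceTag, SyclDeviceTag, FakeAccessor) standing in for Eigen's device types and its SYCL accessor wrapper:

#include <cstdio>

struct CpuDeviceTag {};   // stand-in for e.g. DefaultDevice / GpuDevice
struct SyclDeviceTag {};  // stand-in for the SYCL device

// On most devices the evaluator pointer is just a raw pointer.
template <typename T, typename Device>
struct StorageMemorySketch { typedef T* Type; };

// On SYCL it must be an accessor-like handle. FakeAccessor only mimics
// the interface for illustration.
template <typename T>
struct FakeAccessor {
  T* base;
  T& operator[](long i) const { return base[i]; }
};
template <typename T>
struct StorageMemorySketch<T, SyclDeviceTag> { typedef FakeAccessor<T> Type; };

int main() {
  float buf[4] = {1, 2, 3, 4};
  StorageMemorySketch<float, CpuDeviceTag>::Type p = buf;   // float*
  StorageMemorySketch<float, SyclDeviceTag>::Type a{buf};   // accessor
  std::printf("%g %g\n", p[2], a[2]);  // both index the same storage
}

Because evaluators only ever name Storage::Type, the same evaluator source compiles unchanged against either representation, which is exactly what the evalSubExprsIfNeeded(EvaluatorPointerType) and data() signature changes above buy.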
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 0a394c88d..4aec83cff 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -58,9 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
   };
   template <typename T> struct MakePointer {
     typedef T* Type;
-    typedef T& RefType;
-    typedef T ScalarType;
-
   };
   typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -80,9 +77,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
   };
   template <typename T> struct MakePointer {
     typedef T* Type;
-    typedef T& RefType;
-    typedef T ScalarType;
-
   };
   typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -106,10 +100,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
     // Intermediate typedef to workaround MSVC issue.
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
-    typedef typename MakePointerT::RefType RefType;
-    typedef typename MakePointerT::ScalarType ScalarType;
-
-
   };
   typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -135,49 +125,49 @@ struct traits<TensorRef<PlainObjectType> >
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -189,50 +179,50 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType, int Options, template <class> class MakePointer>
 struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType, int Options, template <class> class MakePointer>
 struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType>
 struct nested<TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType>
 struct nested<const TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 }  // end namespace internal
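Every eval<> and nested<> specialization above replaces a trailing & with EIGEN_DEVICE_REF. A hedged sketch of what that macro expands to; the exact guard macro in the patch series' TensorMacros.h may differ slightly (the compiler-defined device-pass macro is __SYCL_DEVICE_ONLY__, seen elsewhere in this diff):

// Plain '&' everywhere except when compiling the SYCL device pass, where
// nesting an expression by reference to a host-side object is not valid,
// so expressions are nested by value instead.
#if defined(__SYCL_DEVICE_ONLY__)
  #define EIGEN_DEVICE_REF
#else
  #define EIGEN_DEVICE_REF &
#endif

// Effect on the traits above:
//   typedef const TensorRef<PlainObjectType> EIGEN_DEVICE_REF type;
// is `const TensorRef<...>&` on the host, but `const TensorRef<...>`
// (a by-value copy) inside the SYCL kernel.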
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index c1b7a58ca..29a2d5538 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -91,24 +91,6 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
       m_padding_left(padding_left), m_padding_right(padding_right),
       m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
-#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
-                                                          DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
-                                                          DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
-                                                          DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
-                                                          bool padding_explicit, DenseIndex padding_top_z, DenseIndex padding_bottom_z,
-                                                          DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left,
-                                                          DenseIndex padding_right, PaddingType padding_type, Scalar padding_value)
-    : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-      m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-      m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-      m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides),
-      m_col_inflate_strides(col_inflate_strides), m_padding_explicit(padding_explicit), m_padding_top_z(padding_top_z),
-      m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), m_padding_left(padding_left),
-      m_padding_right(padding_right), m_padding_type(padding_type), m_padding_value(padding_value) {}
-
-#endif
-
   EIGEN_DEVICE_FUNC
   DenseIndex patch_planes() const { return m_patch_planes; }
   EIGEN_DEVICE_FUNC
@@ -195,6 +177,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
@@ -205,16 +189,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     CoordAccess = false,
     RawAccess = false
   };
-#ifdef __SYCL_DEVICE_ONLY__
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
-#else
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
-#endif
-      : m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
-      , m_op(op)
-#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -368,9 +345,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
       m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
     }
   }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -531,14 +509,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-#ifdef EIGEN_USE_SYCL
-  // Required by SYCL in order to construct the expression on the device
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
-#endif
 
   Index planePaddingTop() const { return m_planePaddingTop; }
   Index rowPaddingTop() const { return m_rowPaddingTop; }
@@ -556,10 +530,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -635,10 +616,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   TensorEvaluator<ArgType, Device> m_impl;
 
-#ifdef EIGEN_USE_SYCL
-// Required by SYCL in order to construct the expression on the device
-  XprType m_op;
-#endif
 };
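Taken together, these hunks retire the old SYCL integration strategy of reconstructing expressions on the device from impl()/functor()/xpr() hooks (and the delegating-constructor workaround deleted above). In its place, every evaluator exposes bind(cgh), which recursively registers placeholder accessors with the SYCL command-group handler before kernel launch. A toy standalone model of that recursion; FakeHandler and the evaluator names are invented for illustration and only mimic the shape of the real protocol:

struct FakeHandler {};  // stands in for cl::sycl::handler

// A leaf evaluator wraps device memory; on real SYCL its accessor would
// call accessor.bind(cgh) here to attach itself to the command group.
struct LeafEvaluator {
  void bind(FakeHandler& /*cgh*/) { /* accessor registration happens here */ }
};

// Non-leaf evaluators just forward, exactly like the one-line bind()
// bodies added throughout this diff.
template <typename Arg>
struct UnaryEvaluator {
  Arg m_impl;
  void bind(FakeHandler& cgh) { m_impl.bind(cgh); }
};

int main() {
  UnaryEvaluator<UnaryEvaluator<LeafEvaluator> > eval{};
  FakeHandler cgh;
  eval.bind(cgh);  // recurses through the expression tree to the leaf
}

The design pay-off is that expression objects no longer need to be rebuilt on the device at all, which is why the XprType m_op copies and the const XprType by-value constructor variant could be deleted.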