From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 28 Jun 2019 10:08:23 +0100
Subject: [SYCL] This PR adds the minimum modifications to the Eigen
 unsupported module required to run it on devices supporting SYCL.

* Abstracting the pointer type so that both SYCL device memory and ordinary
  pointers can be captured.
* Converting the SYCL virtual pointer to SYCL device memory in the Eigen
  evaluator class.
* Binding the SYCL placeholder accessors to the command group handler via the
  bind method in each Eigen evaluator node.
* Adding a SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to adopt the above
  changes.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 49 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 19 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 2 +-
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 33 +-
 .../Eigen/CXX11/src/Tensor/TensorChipping.h | 38 +-
 .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 25 +-
 .../Eigen/CXX11/src/Tensor/TensorContraction.h | 15 +-
 .../CXX11/src/Tensor/TensorContractionMapper.h | 49 +
 .../Eigen/CXX11/src/Tensor/TensorConversion.h | 32 +-
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 8 +-
 .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 54 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 8 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 4 +
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1155 +++++++++++++-------
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 5 +
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 34 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 228 ++--
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 133 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 19 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 23 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 37 +-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h | 58 +-
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 2 +
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 22 +-
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 45 +-
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 1 +
 .../Eigen/CXX11/src/Tensor/TensorInflation.h | 15 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 16 +-
 .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 17 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 35 +
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 8 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 108 +-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 94 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 26 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 23 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 74 ++
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 153 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 11 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 23 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 31 +-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 27 +-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h | 36 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 13 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 42 +-
 .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 51 +-
 unsupported/test/cxx11_tensor_executor.cpp | 22 +-
 unsupported/test/cxx11_tensor_morphing.cpp | 3 +-
 47 files changed, 1979 insertions(+), 947 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 6f7c6d86d..05e7963f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -37,7 +37,7 @@ struct traits > : public traits template struct eval, Eigen::Dense> { - typedef const TensorIndexTupleOp& type; + typedef const TensorIndexTupleOpEIGEN_DEVICE_REF type; }; template @@ -82,6 +82,8 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, @@ -100,7 +102,7 @@ struct TensorEvaluator, Device> return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -118,11 +120,11 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { - return m_impl; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } #endif @@ -154,7 +156,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorTupleReducerOp& type; + typedef const TensorTupleReducerOpEIGEN_DEVICE_REF type; }; template @@ -216,6 +218,9 @@ struct TensorEvaluator, Devi typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; static const int NumDims = internal::array_size::value; typedef array StrideDims; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory TupleStorageMem; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, @@ -231,9 +236,6 @@ struct TensorEvaluator, Devi : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), m_return_dim(op.return_dim()) -#ifdef EIGEN_USE_SYCL - ,m_device(device) -#endif { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { @@ -242,15 +244,18 @@ struct TensorEvaluator, Devi } else { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - m_stride_div = (m_return_dim >= 0) ? m_strides[m_return_dim] : 1; + } + // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. + m_stride_div = ((m_return_dim >= 0) && + (m_return_dim < static_cast(m_strides.size()))) + ? m_strides[m_return_dim] : 1; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -263,16 +268,13 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; } - #ifndef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - #else // following functions are required by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} - const Device& device() const{return m_device;} - #endif + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_orig_impl.bind(cgh); + } +#endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -312,9 +314,6 @@ struct TensorEvaluator, Devi StrideDims m_strides; Index m_stride_mod; Index m_stride_div; -#ifdef EIGEN_USE_SYCL - const Device& m_device; -#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 06bf422c5..d6e51bc6c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -97,6 +97,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; static const int PacketSize = PacketType::size; static const int NumDims = XprType::NumDims; @@ -136,7 +138,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -154,6 +156,7 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); @@ -199,13 +202,15 @@ struct TensorEvaluator, Device> m_leftImpl.writeBlock(*block); } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } +#endif - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } private: TensorEvaluator m_leftImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 0db637405..38c06aba2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -841,7 +841,7 @@ struct TensorBlockView { const Scalar* data() const { return m_data; } private: - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_block_sizes, m_block_strides; const Scalar* m_data; // Not owned. Scalar* m_allocated_data; // Owned. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index c102a43fb..10bdbc6a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -37,7 +37,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorBroadcastingOp& type; + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; }; template @@ -105,7 +105,11 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; bool isCopy, nByOne, oneByN; + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -205,7 +209,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -238,6 +242,7 @@ struct TensorEvaluator, Device> // TODO: attempt to speed this up. 
The integer divisions and modulo are slow EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -272,6 +277,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -376,6 +382,7 @@ struct TensorEvaluator, Device> values[0] = m_impl.coeff(inputIndex); return internal::pload1(values); } else { + EIGEN_UNROLL_LOOP for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[endDim]) { values[i] = m_impl.coeff(inputIndex); @@ -410,6 +417,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { if (inputIndex > m_inputStrides[dim]-1) { inputIndex = 0; @@ -441,6 +449,7 @@ struct TensorEvaluator, Device> values[0] = m_impl.coeff(inputIndex); return internal::pload1(values); } else { + EIGEN_UNROLL_LOOP for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { if (outputOffset + cur < m_outputStrides[dim]) { values[i] = m_impl.coeff(inputIndex); @@ -465,6 +474,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -500,6 +510,7 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[0]) { values[i] = m_impl.coeff(inputIndex+i); @@ -521,6 +532,7 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -556,6 +568,7 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { values[i] = m_impl.coeff(inputIndex+i); @@ -572,6 +585,7 @@ struct TensorEvaluator, Device> costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost(); if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost(); if (internal::index_statically_eq(i, 1)) { @@ -845,12 +859,17 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } - + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
BroadcastBlock( const Dimensions& input_block_sizes, @@ -874,9 +893,9 @@ struct TensorEvaluator, Device> BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); } - protected: - const Device& m_device; - const Broadcast m_broadcast; +protected: + const Device EIGEN_DEVICE_REF m_device; + const typename internal::remove_reference::type m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 0b5d4127b..7afaf0f33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -38,7 +38,7 @@ struct traits > : public traits template struct eval, Eigen::Dense> { - typedef const TensorChippingOp& type; + typedef const TensorChippingOp EIGEN_DEVICE_REF type; }; template @@ -139,7 +139,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; - + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -169,7 +170,7 @@ struct TensorEvaluator, Device> OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -218,7 +219,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -243,6 +244,7 @@ struct TensorEvaluator, Device> eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; @@ -262,6 +264,7 @@ struct TensorEvaluator, Device> } else { // Cross the stride boundary. Fallback to slow path. 
EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; @@ -349,26 +352,20 @@ struct TensorEvaluator, Device> m_impl.block(&input_block); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { - CoeffReturnType* result = const_cast(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (IsOuterChipping && result) { return result + m_inputOffset; } else { return NULL; } } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { - return m_dim.actualDim(); +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { - return m_offset; - } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const @@ -399,10 +396,7 @@ struct TensorEvaluator, Device> DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; - const Device& m_device; -// required by sycl - const DenseIndex m_offset; - + const Device EIGEN_DEVICE_REF m_device; }; @@ -466,6 +460,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; @@ -484,6 +479,7 @@ struct TensorEvaluator, Device> // Cross stride boundary. Fallback to slow path. 
EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 3863ee8c3..292a1bae1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -119,6 +119,8 @@ struct TensorEvaluator::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, @@ -181,7 +183,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } } else { left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } @@ -235,11 +239,13 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } } else { right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } @@ -257,6 +263,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - const Axis& axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 4ddcd982e..de7c2248a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -433,6 +433,8 @@ struct TensorContractionEvaluatorBase typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -452,6 +454,9 @@ struct TensorContractionEvaluatorBase static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; static const int LDims = internal::array_size::Dimensions>::value; @@ -653,14 +658,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { evalTo(data); return false; } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } @@ -934,7 +939,7 @@ struct TensorContractionEvaluatorBase return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { @@ -1169,9 +1174,9 @@ protected: TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; OutputKernelType m_output_kernel; - Scalar* m_result; + EvaluatorPointerType m_result; bool m_can_use_xsmm; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 1be823fd1..50865d404 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -59,6 +59,13 @@ struct CoeffLoader { return m_tensor.template packet(index); } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + private: const Tensor m_tensor; }; @@ -87,6 +94,13 @@ struct CoeffLoader { { return internal::ploadt_ro(m_data + index); } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } + #endif private: typedef typename Tensor::Scalar Scalar; @@ -139,6 +153,7 @@ class SimpleTensorContractionMapper { EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = nocontract_val / m_ij_strides[i]; linidx += idx * m_nocontract_strides[i]; @@ -155,6 +170,7 @@ class SimpleTensorContractionMapper { Index contract_val = left ? col : row; if(array_size::value > 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = contract_val / m_k_strides[i]; linidx += idx * m_contract_strides[i]; @@ -179,6 +195,7 @@ class SimpleTensorContractionMapper { Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = nocontract_val[0] / m_ij_strides[i]; const Index idx1 = nocontract_val[1] / m_ij_strides[i]; @@ -199,6 +216,7 @@ class SimpleTensorContractionMapper { Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; if (array_size::value> 0) { + EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = contract_val[0] / m_k_strides[i]; const Index idx1 = contract_val[1] / m_k_strides[i]; @@ -230,6 +248,13 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + const CoeffLoader& tensor() const { return m_tensor; } @@ -302,6 +327,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapperm_tensor.coeff(first); + EIGEN_UNROLL_LOOP for (Index k = 1; k < packet_size - 1; k += 2) { const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); data[k] = this->m_tensor.coeff(internal_pair.first); @@ -472,6 +498,13 @@ class TensorContractionSubMapper { return false; } + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_base_mapper.bind(cgh); + } + #endif + const ParentMapper& base_mapper() const { return m_base_mapper; } Index vert_offset() const { return m_vert_offset; } Index horiz_offset() const { return m_horiz_offset; } @@ -515,6 +548,22 @@ class TensorContractionInputMapper }; +template struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait > { + + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; + }; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 938fd0f34..e96f31537 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -129,6 +129,7 @@ struct PacketConverter { typedef typename internal::unpacket_traits::type TgtType; internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < TgtPacketSize; ++i) { values[i] = converter(m_impl.coeff(index+i)); } @@ -164,15 +165,15 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -207,6 +208,7 @@ struct PacketConv { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = converter(impl.coeff(index+i)); } @@ -267,10 +269,18 @@ struct TensorEvaluator, 
Device> typedef typename PacketType::type PacketSourceType; static const int PacketSize = PacketType::size; static const bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, - PacketAccess = true, + PacketAccess = + #ifndef EIGEN_USE_SYCL + true, + #else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, + #endif BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -284,9 +294,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - return ConversionSubExprEval, Scalar>::run(m_impl, data); + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() @@ -330,10 +340,16 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 2d0e6599f..25e1e5896 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -303,6 +303,8 @@ struct TensorEvaluator::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, @@ -469,7 +471,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -525,7 +527,7 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); const bool Vectorize = internal::IsVectorizable::value; @@ -548,7 +550,7 @@ struct TensorEvaluator > template struct eval, Eigen::Dense> { - typedef const TensorCustomUnaryOp& type; + typedef const TensorCustomUnaryOpEIGEN_DEVICE_REF type; }; template @@ -88,7 +88,9 @@ struct TensorEvaluator, Devi typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; - typedef typename PointerType::Type PointerT; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -108,20 +110,20 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) 
{ if (data) { evalTo(data); return false; } else { - m_result = static_cast( - m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); evalTo(m_result); return true; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { + if (m_result) { m_device.deallocate_temp(m_result); m_result = NULL; } @@ -141,22 +143,25 @@ struct TensorEvaluator, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC PointerT data() const { return m_result; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } #endif protected: - EIGEN_DEVICE_FUNC void evalTo(PointerT data) { - TensorMap > result(data, m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } Dimensions m_dimensions; const ArgType m_op; - const Device& m_device; - PointerT m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; @@ -251,7 +256,10 @@ struct TensorEvaluator::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; - typedef typename PointerType::Type PointerT; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -271,12 +279,13 @@ struct TensorEvaluator(m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))); + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); evalTo(m_result); return true; } @@ -303,22 +312,25 @@ struct TensorEvaluator > result(data, m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } Dimensions m_dimensions; const XprType m_op; - const Device& m_device; - PointerT m_result; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 8cb95f731..46b9d3ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -39,6 +39,10 @@ struct DefaultDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { #if !defined(EIGEN_GPU_COMPILE_PHASE) @@ -54,7 +58,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running on the host CPU 
return l1CacheSize(); #elif defined(EIGEN_HIP_DEVICE_COMPILE) @@ -67,7 +71,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) // Running single threaded on the host CPU return l3CacheSize(); #elif defined(EIGEN_HIP_DEVICE_COMPILE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 83cde6afb..ebf85c072 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -215,6 +215,10 @@ struct GpuDevice { stream_->deallocate(buffer); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e7beb2c82..93efe2f82 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,498 +14,879 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -template struct CheckAlignStatically { - static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); -}; -template -struct Conditional_Allocate { +#include - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { - return aligned_alloc(Align, elements); - } -}; -template -struct Conditional_Allocate { +namespace Eigen { - EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return malloc(elements); - } +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; }; -template > -struct SyclAllocator { - typedef Scalar value_type; - typedef typename std::allocator_traits::pointer pointer; - typedef typename std::allocator_traits::size_type size_type; - SyclAllocator( ){}; - Scalar* allocate(std::size_t elements) { - return static_cast(Conditional_Allocate::Val, Align>::conditional_allocate(elements)); +} // end namespace internal +} // end namespace TensorSycl + +typedef 
TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } } - void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } -}; - -namespace Eigen { + return supported_devices; +#endif +} -#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast(static_cast(((buf_acc.get_pointer().get())))) -#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} - template class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } - private: - read_accessor m_src_acc; - 
write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - -template - struct memsetkernelFunctor{ - AccType m_acc; - const ptrdiff_t buff_offset; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; - } - - }; - -struct memsetCghFunctor{ - cl::sycl::buffer >& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } -}; -//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ -std::vector supported_devices; -auto plafrom_list =cl::sycl::platform::get_platforms(); -for(const auto& platform : plafrom_list){ - auto device_list = platform.get_devices(); - auto platform_name =platform.template get_info(); - std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); - for(const auto& device : device_list){ - auto vendor = device.template get_info(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) || - (device.is_gpu() && platform_name.find("intel")!=std::string::npos); - if(!unsuported_condition){ - std::cout << "Platform name "<< platform_name << std::endl; - supported_devices.push_back(device); + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. The device + /// pointer would be deleted by calling deallocate function. 
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } -} -return supported_devices; -} -class QueueInterface { -public: - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template explicit QueueInterface(const dev_Selector& s): -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; } - } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } - })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); } -})) + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); #endif - {} - - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. 
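As a rough illustration of the virtual-pointer scheme described in the comments above, the round trip below shows how an allocation is used purely as a key and later converted back to device memory. This is only a sketch, not part of the patch: the QueueInterface instance `qi` and the sizes are hypothetical, and allocate() first rounds the request up to a multiple of EIGEN_MAX_ALIGN_BYTES (for instance a 300-byte request becomes 304 bytes when EIGEN_MAX_ALIGN_BYTES is 16).

    // Hypothetical call sites against the QueueInterface defined in this file.
    void*  key       = qi.allocate(1024 * sizeof(float));  // host-side key, not dereferenceable memory
    float* typed_key = static_cast<float*>(key);           // still only a key into the internal pointer map
    auto   device_mem = qi.get(typed_key);                 // TensorSycl::internal::RangeAccess over the SYCL buffer
    float* same_key   = qi.get(device_mem);                // converts back to the original key
    qi.deallocate(key);                                    // drops the buffer from the pointer map
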
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - std::lock_guard lock(mutex_); - auto buf = cl::sycl::buffer >(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access().get_pointer(); - buf.set_final_data(nullptr); - buffer_map.insert(std::pair > >(static_cast(ptr),buf)); - return static_cast(ptr); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(mutex_); - auto it = buffer_map.find(static_cast(p)); - if (it != buffer_map.end()) { - buffer_map.erase(it); - } + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); } EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(mutex_); - buffer_map.clear(); - } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with write mode - /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark - /// Also, this is faster as it uses the map_allocator instead of memcpy - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto it =find_buffer(dst); - auto offset =static_cast(static_cast(dst))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto src_buf = cl::sycl::buffer >(static_cast(static_cast(const_cast(src))), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto dst_acc= it->second.template get_access(cgh); - auto src_acc =src_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, offset, 0)); - }); - synchronize(); - } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
- template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it =find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = find_buffer(static_cast(src)); - auto it2 = find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif } + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happend which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. 
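The new copy entry points described in the comments above take a completion callback; passing an empty callback makes the call behave as a blocking copy. A minimal caller-side sketch, assuming the callback parameter is a std::function<void()> (consistent with the `if (callback) callback();` checks in this file) and with `qi`, the pointers and the byte count being hypothetical:

    // Asynchronous host-to-device copy; the lambda runs once the copy has completed.
    qi.memcpyHostToDevice(device_dst, host_src, n_bytes,
                          []() { /* e.g. release a staging buffer here */ });
    // Passing an empty std::function makes the device-to-host copy block until it is done.
    qi.memcpyDeviceToHost(host_dst, device_src, n_bytes, std::function<void()>{});
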
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = find_buffer(static_cast(data)); - ptrdiff_t buff_offset= (static_cast(data)) - it1->first; - m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. 
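[Editor's note] Before moving on to the range accessors, a small sketch of the `memset` above: as with `std::memset`, only the low byte of `c` is broadcast across the `n` bytes (hence the cast to `uint8_t`), so arbitrary multi-byte patterns cannot be expressed. Device setup is assumed.

```cpp
// Illustrative only: zero device memory through the fill-based memset above.
// Like std::memset, the int argument is truncated to a single byte value.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void clear_bytes(const Eigen::SyclDevice &dev, void *device_ptr, size_t n) {
  dev.memset(device_ptr, 0, n);
  // The fill is submitted asynchronously; wait before relying on the result.
  dev.synchronize();
}
```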
+ template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); + } + + /// Get a range accessor to the virtual pointer's device memory with a + /// specified size. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); } /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (find_buffer(ptr)->second.template get_access(cgh)); + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. 
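[Editor's note] To make the index arithmetic in `get_range_accessor` above easier to follow, here is a self-contained model of what it computes from a virtual pointer: the mapper yields the owning buffer and a byte offset, the offset is converted to elements of `T`, and the accessible range is whatever remains of the reinterpreted buffer. The struct names below are stand-ins, not the Eigen or SYCL classes; only the offset/size computation mirrors the code above.

```cpp
// Toy model of the placeholder range-accessor arithmetic; illustrative only.
#include <cassert>
#include <cstddef>

struct MappedBuffer {   // stands in for the cl::sycl::buffer found via pMapper
  std::size_t size_bytes;
};

struct RangeInfo {      // stands in for RangeAccess: element offset + count
  std::size_t element_offset;
  std::size_t element_count;
};

template <typename T>
RangeInfo make_range(const MappedBuffer &buf, std::ptrdiff_t byte_offset) {
  assert(byte_offset >= 0);
  const std::size_t typed_offset =
      static_cast<std::size_t>(byte_offset) / sizeof(T);     // offset in Ts
  const std::size_t typed_size = buf.size_bytes / sizeof(T); // buffer in Ts
  assert(typed_offset <= typed_size);
  return {typed_offset, typed_size - typed_offset};          // accessible tail
}
```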
+ template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return (static_cast(ptr))-(find_buffer(ptr)->first); + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); } EIGEN_STRONG_INLINE void synchronize() const { - m_queue.wait_and_throw(); //pass +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif } - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled - m_queue.wait_and_throw(); //pass + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif } - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(m_queue.get_device(). template get_info()); - auto s= m_queue.get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); if (xMode != 0) GRange += static_cast(tileSize - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + 
std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); + tileSize0 = static_cast(max_workgroup_Size / tileSize1); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); + tileSize2 = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + rng2 = dim2; + if (rng2 == 0) rng1 = static_cast(1); + GRange2 = rng2; + if (tileSize2 > GRange2) + tileSize2 = GRange2; + else if (GRange2 > tileSize2) { + Index xMode = static_cast(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); + pow_of_2 = static_cast( + 
std::log2(static_cast(max_workgroup_Size / tileSize2))); + tileSize1 = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + rng1 = dim1; + if (rng1 == 0) rng1 = static_cast(1); + GRange1 = rng1; + if (tileSize1 > GRange1) + tileSize1 = GRange1; + else if (GRange1 > tileSize1) { + Index xMode = static_cast(GRange1 % tileSize1); if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + tileSize0 = + static_cast(max_workgroup_Size / (tileSize1 * tileSize2)); rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); + if (rng0 == 0) rng0 = static_cast(1); + GRange0 = rng0; + if (tileSize0 > GRange0) + tileSize0 = GRange0; + else if (GRange0 > tileSize0) { + Index xMode = static_cast(GRange0 % tileSize0); if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_compute_units; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept + // OpenCL doesnot have such concept return 2; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue.get_device(). template get_info(); + return m_device_info.local_mem_size; } - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? 
(wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { if (!exception_caught_) { - m_queue.wait_and_throw(); + synchronize(); } return !exception_caught_; } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list l) const { + bool exception_caught = false; + for (const auto &e : l) { + if (e) { + exception_caught = true; +#ifdef EIGEN_EXCEPTIONS + try { + std::rethrow_exception(e); + } catch (const cl::sycl::exception &e) { + std::cerr << e.what() << std::endl; + } +#else + std::cerr << "Error detected inside Sycl device." << std::endl; + abort(); +#endif + } + } + return exception_caught; + } -private: /// class members: bool exception_caught_ = false; - mutable std::mutex mutex_; + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map > > buffer_map; + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. + mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif /// sycl queue mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif - EIGEN_STRONG_INLINE std::map > >::iterator find_buffer(const void* ptr) const { - std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. 
Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; } }; -// Here is a sycl deviuce struct which accept the sycl queue interface +// Here is a sycl device struct which accept the sycl queue interface // as an input -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } // get sycl accessor - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return m_queue_stream->template get_sycl_accessor(cgh, ptr); + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_sycl_buffer(ptr); + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, + Index &tileSize0, + Index &tileSize1, Index &rng0, + Index &rng1, Index &GRange0, + Index &GRange1) const { + queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, + rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, + Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, + Index &GRange1, Index &GRange2) const { + queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, + tileSize2, rng0, rng1, rng2, GRange0, + GRange1, GRange2); } - /// This is used to prepare the number of threads and also the number of threads per block 
for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); - - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); - } /// deallocate device memory EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } + queue_stream()->deallocate(p); + } - // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpy(dst,src,n); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); } + /// attach existing buffer + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return m_queue_stream->get_offset(ptr); + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); } -// memcpyHostToDevice - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - 
m_queue_stream->memcpyHostToDevice(dst,src,n); + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); } -/// here is the memcpyDeviceToHost - template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - m_queue_stream->memcpyDeviceToHost(dst,src,n); + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); } - /// Here is the implementation of memset function on sycl. + /// the memset function EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - m_queue_stream->memset(data,c,n); + queue_stream()->memset(data, c, n); } /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and @@ -513,40 +894,64 @@ struct SyclDevice { return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_queue_stream->getNumSyclMultiProcessors(); + return queue_stream()->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_queue_stream->maxSyclThreadsPerBlock(); + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesn't have such concept - return m_queue_stream->maxSyclThreadsPerMultiProcessor(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_queue_stream->sharedMemPerBlock(); + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } EIGEN_STRONG_INLINE void synchronize() const { - m_queue_stream->synchronize(); //pass + queue_stream()->synchronize(); } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - m_queue_stream->asynchronousExec(); + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return 
queue_stream()->get_latest_event(); } + // This function checks if the runtime recorded an error for the // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); } }; -// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. -// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator -// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation -struct SyclKernelDevice:DefaultDevice{}; - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index b43db40c8..ef22a268a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -75,6 +75,11 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { #ifdef __ANDROID__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 554ee5f59..576a4f3ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -42,7 +42,6 @@ struct traits > // Intermediate typedef to workaround MSVC issue. 
typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; }; @@ -103,7 +102,9 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; - + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, @@ -115,22 +116,16 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename internal::traits >::template MakePointer::Type DevicePointer; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -165,19 +160,20 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } ArgType expression() const { return m_expression; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_buffer.bind(cgh); + } + #endif - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} private: TensorEvaluator m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; + EvaluatorPointerType m_buffer; const ArgType m_expression; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 7f0f4acbc..1d48b5eed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -34,6 +34,9 @@ struct TensorEvaluator typedef typename Derived::Dimensions Dimensions; typedef Derived XprType; static const int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? 
@@ -60,16 +63,17 @@ struct TensorEvaluator TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + : m_data(device.get((const_cast(m.data())))), + m_dims(m.dimensions()), + m_device(device) { } - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { if (!NumTraits::type>::RequireInitialization && dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; @@ -78,14 +82,12 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits::template MakePointer::RefType - coeffRef(Index index) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); return m_data[index]; } @@ -114,7 +116,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -122,10 +124,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits::template MakePointer::RefType + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -152,16 +153,18 @@ struct TensorEvaluator TensorBlockWriter::Run(block, m_data); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device m_device; }; namespace { @@ -184,6 +187,13 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address) { + return *address; +} +#endif } @@ -197,7 +207,9 @@ struct TensorEvaluator typedef typename PacketType::type 
PacketReturnType; typedef typename Derived::Dimensions Dimensions; typedef const Derived XprType; - + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? @@ -221,18 +233,15 @@ struct TensorEvaluator typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlockReader; - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; @@ -241,13 +250,8 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); -#ifndef __SYCL_DEVICE_ONLY__ + eigen_assert(m_data != NULL); return loadConstant(m_data+index); -#else - CoeffReturnType tmp = m_data[index]; - return tmp; -#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -269,7 +273,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -288,16 +292,17 @@ struct TensorEvaluator TensorBlockReader::Run(block, m_data); } - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits::template MakePointer::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device m_device; }; @@ -310,16 +315,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() @@ -331,10 +326,26 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -354,13 +365,14 @@ struct TensorEvaluator, Device> PacketType::size); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif private: const NullaryOp m_functor; @@ -401,14 +413,15 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; - + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; static const int NumDims = internal::array_size::value; typedef internal::TensorBlock TensorBlock; EIGEN_DEVICE_FUNC const 
Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } @@ -456,16 +469,18 @@ struct TensorEvaluator, Device> arg_block.data()); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif private: - const Device& m_device; + const Device m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -509,6 +524,8 @@ struct TensorEvaluator::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; static const int NumDims = internal::array_size< typename TensorEvaluator::Dimensions>::value; @@ -524,7 +541,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif private: - const Device& m_device; + const Device m_device; const BinaryOp m_functor; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -639,6 +657,8 @@ struct TensorEvaluator::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -646,7 +666,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif private: const TernaryOp m_functor; @@ -731,6 +753,8 @@ struct TensorEvaluator typedef typename PacketType::type 
PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -738,7 +762,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -757,13 +781,15 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -773,14 +799,16 @@ struct TensorEvaluator .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7b5842571..47e9b24ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -442,12 +442,133 @@ EIGEN_STRONG_INLINE void TensorExecutor -class TensorExecutor { -public: - static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); +template +struct ExecExprFunctorKernel_impl { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } +}; + +template +struct 
ExecExprFunctorKernel_impl { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + if (gId < vectorizable_threads) { + const Index PacketSize = Eigen::internal::unpacket_traits< + typename Evaluator::PacketReturnType>::size; + evaluator.evalPacket(gId * PacketSize); + gId += (vectorizable_threads * PacketSize); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += vectorizable_threads) { + evaluator.evalScalar(i); + } + } + } +}; + +template +struct ExecExprFunctorKernel + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable::value, + Evaluator> { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable::value, + Evaluator>(range_, vectorizable_threads_, evaluator) {} +}; + +template +struct ExecExprFunctorKernel + : ExecExprFunctorKernel_impl { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl( + range_, vectorizable_threads_, evaluator) {} +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { + Eigen::TensorEvaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType< + typename Eigen::TensorEvaluator::CoeffReturnType, + Eigen::SyclDevice>::size; + Index vectorizable_threads = + static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + auto f = [&](cl::sycl::handler &cgh) { + evaluator.bind(cgh); + typedef ExecExprFunctorKernel> + conditional_vectorized_kernel; + + typedef ExecExprFunctorKernel> + non_vectorized_kernel; +// This is to make sure that an expression with a size less than vectorized size +// will not call the vectorized kernel. +// The reason for having this kernel is that the vectorisable parameter is a +// compile-time parameter, +// however, the size of a tensor is a run-time parameter + (vectorizable_threads) + ? 
cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + conditional_vectorized_kernel(range, vectorizable_threads, + evaluator)) + : cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + non_vectorized_kernel(range, vectorizable_threads, + evaluator)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); + dev.async_synchronize(e); + } + evaluator.cleanup(); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 4b6540c07..c9bccfc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -89,7 +89,10 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename TypeConversion::type PointerType; + typedef typename TypeConversion::type + PointerType; }; template @@ -164,9 +167,10 @@ struct traits > static const int Layout = XprTraits::Layout; typedef typename TypeConversion::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type PointerType; + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; @@ -245,9 +249,10 @@ struct traits::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type PointerType; + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; enum { Flags = 0 }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 480cf1f39..8d1a6d9cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -131,6 +131,8 @@ struct TensorEvaluator, D typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -167,13 +169,13 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); return false; } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); evalToBuf(m_data); return true; } @@ -202,11 +204,16 @@ struct TensorEvaluator, D return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
evalToBuf(EvaluatorPointerType data) { const bool write_to_out = internal::is_same::value; ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); @@ -576,12 +583,12 @@ struct TensorEvaluator, D protected: Index m_size; - const FFT& m_fft; + const FFT EIGEN_DEVICE_REF m_fft; Dimensions m_dimensions; array m_strides; TensorEvaluator m_impl; - CoeffReturnType* m_data; - const Device& m_device; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 74b905329..e7b7c1e6b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -78,9 +78,10 @@ class TensorForcedEvalOp : public TensorBase, ReadOn }; -template -struct TensorEvaluator, Device> +template +struct TensorEvaluator, Device> { + typedef const typename internal::remove_all::type ArgType; typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -88,6 +89,9 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -106,8 +110,8 @@ struct TensorEvaluator, Device> TensorBlockReader; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + : m_impl(op.expression(), device), m_op(op.expression()), + m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } @@ -115,17 +119,19 @@ struct TensorEvaluator, Device> #if !defined(EIGEN_HIPCC) EIGEN_DEVICE_FUNC #endif - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + #ifndef EIGEN_USE_SYCL // Should initialize the memory in case we're dealing with non POD types. 
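// (On SYCL this initialization is skipped via the #ifndef EIGEN_USE_SYCL guard above: there the
// buffer is device memory reached through a placeholder accessor rather than a host-addressable
// pointer, so it cannot be placement-new constructed from the host at this point.)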
if (NumTraits::RequireInitialization) { for (Index i = 0; i < numValues; ++i) { new(m_buffer+i) CoeffReturnType(); } } + #endif typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); + EvalTo evalToTmp(m_device.get(m_buffer), m_op); const bool Vectorize = internal::IsVectorizable::value; internal::TensorExecutor::type, Vectorize>::run(evalToTmp, m_device); return true; @@ -159,17 +165,20 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Eigen::internal::traits::PointerType data() const { return m_buffer; } + EvaluatorPointerType data() const { return m_buffer; } - /// required by sycl in order to extract the sycl accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_buffer.bind(cgh); + m_impl.bind(cgh); + } +#endif private: TensorEvaluator m_impl; const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_buffer; + const Device m_device; + EvaluatorPointerType m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 09b7c994b..ff8a19f87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,17 +20,20 @@ namespace Eigen { // map_allocator. template struct MakePointer { typedef T* Type; - typedef T& RefType; - typedef T ScalarType; }; -// The PointerType class is a container of the device specefic pointer -// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression +template +EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression // is a device-agnostic type and need MakePointer class for type conversion, -// the TensorEvaluator calls can be specialized for a device, hence it is possible +// the TensorEvaluator class can be specialized for a device, hence it is possible // to construct different types of temproray storage memory in TensorEvaluator -// for different devices by specializing the following PointerType class. -template struct PointerType : MakePointer{}; +// for different devices by specializing the following StorageMemory class. 
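// As a minimal sketch of the pattern the evaluators in this patch follow, an evaluator typically declares
//   typedef StorageMemory<CoeffReturnType, Device> Storage;
//   typedef typename Storage::Type EvaluatorPointerType;
// so that EvaluatorPointerType is a plain CoeffReturnType* on CPU/GPU devices, while the SyclDevice
// specialization further down in this header maps it to a TensorSycl::internal::RangeAccess
// placeholder accessor.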
+template struct StorageMemory: MakePointer {}; namespace internal{ template struct Pointer_type_promotion { @@ -39,24 +42,10 @@ template struct Pointer_type_promotion { template struct Pointer_type_promotion { static const bool val = true; }; -template struct TypeConversion; -#ifndef __SYCL_DEVICE_ONLY__ -template struct TypeConversion{ +template struct TypeConversion { typedef A* type; }; -#endif -} - -#if defined(EIGEN_USE_SYCL) -namespace TensorSycl { -namespace internal{ -template class ReductionFunctor; -template -class FullReductionKernelFunctor; -} } -#endif - template class MakePointer_ = MakePointer> class TensorMap; @@ -113,6 +102,31 @@ struct ThreadPoolDevice; struct GpuDevice; struct SyclDevice; +#ifdef EIGEN_USE_SYCL + +template struct MakeSYCLPointer { + typedef Eigen::TensorSycl::internal::RangeAccess Type; +}; + +template +EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess& +constCast(const Eigen::TensorSycl::internal::RangeAccess& data) { + return data; +} + +template +struct StorageMemory : MakeSYCLPointer {}; +template +struct StorageMemory : StorageMemory {}; + +namespace TensorSycl { +namespace internal{ +template class ReductionFunctor; +} +} +#endif + + enum FFTResultType { RealPart = 0, ImagPart = 1, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 51572f9e7..2edc45f1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -421,6 +421,7 @@ class GaussianGenerator { const array& std_devs) : m_means(means) { + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; } @@ -428,6 +429,7 @@ class GaussianGenerator { EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { T tmp = T(0); + EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 204a6fd33..b7ad33626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -88,6 +88,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = (PacketType::size > 1), @@ -104,22 +106,21 @@ struct TensorEvaluator, Device> TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), m_generator(op.generator()) -#ifdef EIGEN_USE_SYCL - , m_argImpl(op.expression(), device) -#endif + : m_device(device), m_generator(op.generator()) { TensorEvaluator argImpl(op.expression(), device); m_dimensions = argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; + EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); @@ -129,7 +130,7 @@ struct 
TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -234,11 +235,11 @@ struct TensorEvaluator, Device> TensorOpCost::MulCost()); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} #endif protected: @@ -261,14 +262,11 @@ struct TensorEvaluator, Device> } } - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; array m_strides; array m_fast_strides; Generator m_generator; -#ifdef EIGEN_USE_SYCL - TensorEvaluator m_argImpl; -#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 965bd8f1e..5ff67bdae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -154,23 +154,6 @@ class TensorImagePatchOp : public TensorBase, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -256,15 +241,8 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock OutputTensorBlock; -#ifdef __SYCL_DEVICE_ONLY__ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) - #else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) - #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) -#ifdef EIGEN_USE_SYCL - , m_op(op) -#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -410,7 +388,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -516,13 +494,15 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } #ifdef EIGEN_USE_SYCL - // Required by SYCL in order to construct the expression tree on the device - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } + // binding placeholder accessors to a command group handler for SYCL + 
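// bind() is invoked on every node of the evaluator tree before the kernel is submitted: interior
// nodes such as this one forward the command group handler to their child evaluators, while leaf
// evaluators register their own storage with it, roughly (sketch):
//   EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { m_data.bind(cgh); }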
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -693,6 +673,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -744,12 +725,8 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator m_impl; - #ifdef EIGEN_USE_SYCL - // Required for SYCL in order to construct the expression tree on the device - XprType m_op; - #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 32caccf87..be8f3a734 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -353,6 +353,7 @@ namespace internal { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { Index result = 1; + EIGEN_UNROLL_LOOP for (size_t i = 0; i < array_size >::value; ++i) { result *= sizes[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index e28565009..f8cda6574 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -86,6 +86,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, @@ -131,7 +133,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -146,6 +148,7 @@ struct TensorEvaluator, Device> eigen_assert(index < dimensions().TotalSize()); *inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -160,6 +163,7 @@ struct TensorEvaluator, Device> *inputIndex += index / m_strides[0]; return true; } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -195,6 +199,7 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -215,11 +220,13 @@ struct TensorEvaluator, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return 
m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index b6d445c50..e7fec5d3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -37,7 +37,7 @@ namespace { { #ifdef EIGEN_GPU_COMPILE_PHASE return __clz(val); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; @@ -55,8 +55,8 @@ namespace { { #ifdef EIGEN_GPU_COMPILE_PHASE return __clzll(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -92,7 +92,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { #if defined(EIGEN_GPU_COMPILE_PHASE) return __umulhi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::mul_hi(a, static_cast(b)); #else return (static_cast(a) * b) >> 32; @@ -103,7 +103,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { #if defined(EIGEN_GPU_COMPILE_PHASE) return __umul64hi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) +#elif defined(SYCL_DEVICE_ONLY) return cl::sycl::mul_hi(a, static_cast(b)); #elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); @@ -124,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -205,8 +205,8 @@ class TensorIntDivisor { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef EIGEN_GPU_COMPILE_PHASE return (__umulhi(magic, n) >> shift); -#elif defined(__SYCL_DEVICE_ONLY__) - return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 998757d14..755170a34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -134,13 +134,22 @@ struct TensorEvaluator, Device> } } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef 
typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -162,7 +171,9 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } const TensorEvaluator& impl() const { return m_impl; } @@ -192,7 +203,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - + typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index c6ca396a3..3d859f42d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -59,4 +59,39 @@ #define EIGEN_SLEEP(n) sleep(std::max(1, n/1000)) #endif +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) + #define EIGEN_DEVICE_REF +#else + #define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#if defined(EIGEN_EXCEPTIONS) + #define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + try { X; } \ + catch(const cl::sycl::exception& e) { \ + std::cerr << "SYCL exception at " \ + << __FILE__ << ":" << __LINE__ << std::endl \ + << e.what() << std::endl; \ + std::rethrow_exception(std::current_exception()); \ + } \ + } while (false) +#else + #define EIGEN_SYCL_TRY_CATCH(X) X +#endif + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 28f629080..395cdf9c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -31,8 +31,12 @@ template class MakePoin public: typedef TensorMap Self; typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; + #ifdef EIGEN_USE_SYCL + typedef typename Eigen::internal::remove_reference::type>::type Nested; + #else + typedef typename Eigen::internal::nested::type Nested; + #endif + typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename 
internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 87be090f9..6afc98877 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -85,9 +85,57 @@ struct PacketType { #endif #if defined(EIGEN_USE_SYCL) -template - struct PacketType { - typedef T type; + +namespace TensorSycl { +namespace internal { + +template struct PlusOp { + static constexpr Index Value = A + B; +}; + +template struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, + StepOp>::loop(op); + } +}; +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static const int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static const int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { + return ((((x) + (y)-1) / (y)) * (y)); +} + +} // namespace internal +} // namespace TensorSycl + +template <> + struct PacketType { + typedef half type; static const int size = 1; enum { HasAdd = 0, @@ -104,9 +152,59 @@ template HasBlend = 0 }; }; -#endif +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + HasHalfPacket = 0 + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; + +}; + +template +struct PacketType : PacketType{}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ +template<> struct PacketType : internal::sycl_packet_traits \ +{\ + typedef typename internal::packet_traits::type type;\ + typedef typename internal::packet_traits::half half;\ +}; +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template<> struct PacketType: PacketType{}; +template<> struct PacketType: PacketType{}; +#endif +#endif + // Tuple mimics std::pair but works on e.g. nvcc. 
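// The TensorSycl::internal helpers introduced above give SYCL kernels compile-time loops and
// packet-size queries. A minimal usage sketch (the names `in`, `out` and `n` are placeholders):
//   // visits 0, 1, 2, 3; fully unrolled at compile time
//   Eigen::TensorSycl::internal::static_for<int, 0, 4, 1, Eigen::TensorSycl::internal::PlusOp>::loop(
//       [&](int i) { out[i] = in[i]; });
//   const Eigen::Index padded = Eigen::TensorSycl::internal::roundUp(n, PacketSize);  // round n up to a packet multiple
// (The Tuple helper declared next is unrelated to these SYCL helpers; as noted above it simply
// mimics std::pair for device code.)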
template struct Tuple { public: @@ -124,7 +222,7 @@ template struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { - #ifndef __SYCL_DEVICE_ONLY__ + #ifndef SYCL_DEVICE_ONLY if (&rhs == this) return *this; #endif first = rhs.first; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 5352c8f7b..8f6e987b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -37,7 +37,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorReshapingOp& type; + typedef const TensorReshapingOpEIGEN_DEVICE_REF type; }; template @@ -106,6 +106,9 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory::type, Device> ConstCastStorage; static const int NumOutputDims = internal::array_size::value; static const int NumInputDims = internal::array_size::Dimensions>::value; @@ -168,7 +171,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -326,10 +329,18 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return const_cast(m_impl.data()); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif protected: TensorEvaluator m_impl; NewDimensions m_dimensions; @@ -404,7 +415,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorSlicingOp& type; + typedef const TensorSlicingOpEIGEN_DEVICE_REF type; }; template @@ -488,7 +499,7 @@ template struct MemcpyTriggerForSlicing { // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. 
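// The SYCL specialization below keeps the same policy, triggering the bulk copy only above
// 4*1024*1024 contiguous values; in the SYCL slicing path the raw addresses handed to memcpy are
// obtained through m_device.get(...), which resolves the virtual pointer / placeholder accessor to
// actual device memory.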
#ifdef EIGEN_USE_SYCL -template struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; @@ -508,6 +519,9 @@ struct TensorEvaluator, Devi typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -575,7 +589,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits::type>::RequireInitialization && data && m_impl.data() @@ -599,10 +613,10 @@ struct TensorEvaluator, Devi // Use memcpy if it's going to be faster than using the regular evaluation. const MemcpyTriggerForSlicing trigger(m_device); if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); } return false; } @@ -637,6 +651,7 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -648,6 +663,7 @@ struct TensorEvaluator, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -667,6 +683,7 @@ struct TensorEvaluator, Devi EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -698,8 +715,8 @@ struct TensorEvaluator, Devi m_impl.block(&input_block); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { - Scalar* result = const_cast(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (result) { Index offset = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -733,19 +750,19 @@ struct TensorEvaluator, Devi } return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const{ - return m_impl; - } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{ - return m_offsets; +#ifdef EIGEN_USE_SYCL + // binding placeholder 
accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } +#endif + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -753,6 +770,7 @@ struct TensorEvaluator, Devi } inputIndex += (index + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -767,7 +785,7 @@ struct TensorEvaluator, Devi array, NumDims> m_fastOutputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; @@ -829,6 +847,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -840,6 +859,7 @@ struct TensorEvaluator, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -859,6 +879,7 @@ struct TensorEvaluator, Device> internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } @@ -892,7 +913,7 @@ struct traits struct eval, Eigen::Dense> { - typedef const TensorStridingSlicingOp& type; + typedef const TensorStridingSlicingOpEIGEN_DEVICE_REF type; }; template @@ -969,6 +990,8 @@ struct TensorEvaluator::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; typedef Strides Dimensions; enum { @@ -985,8 +1008,7 @@ struct TensorEvaluator startIndicesClamped, stopIndicesClamped; @@ -1069,7 +1091,7 @@ struct TensorEvaluator::PointerType data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } - - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const{return m_impl;} - +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int 
i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -1125,7 +1145,7 @@ struct TensorEvaluator m_inputStrides; bool m_is_identity; TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; DSizes m_startIndices; // clamped startIndices DSizes m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - //use by sycl - const StartIndices m_exprStartIndices; - //use by sycl - const StopIndices m_exprStopIndices; }; // Eval as lvalue diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 4837f2200..e98382cc1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -92,6 +92,8 @@ struct TensorEvaluator, Device typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = true, @@ -138,7 +140,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -151,6 +153,7 @@ struct TensorEvaluator, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (isPaddingAtIndexForDim(idx, i)) { @@ -164,6 +167,7 @@ struct TensorEvaluator, Device } inputIndex += (index - m_padding[0].first); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (isPaddingAtIndexForDim(idx, i)) { @@ -192,23 +196,25 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); } else { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); } return cost; } - EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const{return m_impl;} +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + 
m_impl.bind(cgh); + } +#endif private: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( @@ -272,6 +278,7 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index firstIdx = index; const Index lastIdx = index + PacketSize - 1; @@ -329,7 +336,7 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; - + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index firstIdx = index; const Index lastIdx = index + PacketSize - 1; @@ -383,6 +390,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 4292fe0c2..47db839db 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -89,6 +89,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { @@ -103,9 +105,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) -#ifdef EIGEN_USE_SYCL - , m_patch_dims(op.patch_dims()) -#endif { Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -149,7 +148,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -167,6 +166,7 @@ struct TensorEvaluator, Device> Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -175,6 +175,7 @@ struct TensorEvaluator, Device> inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -202,6 +203,7 @@ struct TensorEvaluator, Device> Index inputIndices[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -217,6 +219,7 @@ struct TensorEvaluator, Device> inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -243,6 +246,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] 
= m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -259,11 +263,13 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } #endif protected: @@ -274,9 +280,6 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; -#ifdef EIGEN_USE_SYCL - const PatchDim m_patch_dims; -#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 787cbd031..2be4f9cc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -44,6 +45,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; + #else // Augment the current time with pseudo random number generation // to ensure that we get different seeds if we try to generate seeds @@ -147,14 +149,41 @@ template class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL the PCG_XSH_RS_state cannot be built in a single step, so m_state is initialized in two stages. + // The constructor of the functor is called on the CPU, where only the clock-based seed is available; that seed is + // the same for every thread because, unlike CUDA, the thread/block id is not a global function and only becomes + // available inside operator(), which runs on the device. + // Thus for CUDA (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, + // whereas for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread + // adds (global_thread_id * 6364136223846793005ULL) for itself, only once, to complete the construction similar to CUDA. + // The thread id injection is therefore not possible at this stage; when operator() is called, the thread id + // (the index i) is added to the seed to build a unique m_state per thread, matching the CUDA behaviour.
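// A minimal sketch of the two stages (constants as quoted above):
//   host, in this constructor: m_state  = clock_or_seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
//   device, in operator()    : m_state += i * 6364136223846793005ULL;   // done once per thread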
+ m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; + #ifdef EIGEN_USE_SYCL + m_exec_once =other.m_exec_once; + #endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of the seeding: the thread id is added to the CPU clock seed to build a unique seed per thread. + // The (i * 6364136223846793005ULL) term is the remaining part of the PCG_XSH_RS_state construction on the device side. + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif T result = RandomToTypeUniform(&m_state, i); return result; } @@ -163,6 +192,14 @@ template class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of the seeding: the thread id is added to the CPU clock seed to build a unique seed per thread. + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { values[j] = RandomToTypeUniform(&m_state, i); } @@ -171,6 +208,9 @@ template class UniformRandomGenerator { private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; template @@ -222,14 +262,37 @@ template class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL the PCG_XSH_RS_state cannot be built in a single step, so m_state is initialized in two stages. + // The constructor of the functor is called on the CPU, where only the clock-based seed is available; that seed is + // the same for every thread because, unlike CUDA, the thread/block id is not a global function and only becomes + // available inside operator(), which runs on the device. + // The thread id injection is therefore not possible at this stage; when operator() is called, the thread id + // (the index i) is added to the seed to build a unique m_state per thread, matching the CUDA behaviour.
+ m_exec_once =false; + #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once=other.m_exec_once; +#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif T result = RandomToTypeNormal(&m_state, i); return result; } @@ -238,6 +301,14 @@ template class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP for (int j = 0; j < packetSize; ++j) { values[j] = RandomToTypeNormal(&m_state, i); } @@ -246,6 +317,9 @@ template class NormalRandomGenerator { private: mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index bb63433fe..5dddfcf85 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -299,7 +299,7 @@ template ::reduce(self, 0, num_coeffs, reducer); } @@ -400,6 +400,18 @@ struct OuterReducer { } }; +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) template @@ -423,6 +435,23 @@ template __global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. 
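 * A minimal sketch of such a reducer (hypothetical, for illustration only): an "any non-zero" functor
 * whose accumulator is bool while the inputs are float:
 *   struct AnyNonZeroReducer {
 *     EIGEN_DEVICE_FUNC bool initialize() const { return false; }
 *     EIGEN_DEVICE_FUNC void reduce(const float t, bool* accum) const { *accum = *accum || (t != 0.0f); }
 *     EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const { *accum = *accum || t; }
 *     EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
 *   };
 * With such an Op the deduced CoeffReturnType of the reduction is bool rather than float.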
+ */ +template +struct ReductionReturnType { +#if EIGEN_HAS_CXX11 && defined(EIGEN_USE_SYCL) + typedef typename remove_const().initialize())>::type type; +#else + typedef typename remove_const::type type; +#endif +}; + template @@ -520,12 +549,15 @@ class TensorReductionOp : public TensorBase +struct TensorReductionEvaluatorBase; // Eval as rvalue template class MakePointer_, typename Device> -struct TensorEvaluator, Device> +struct TensorReductionEvaluatorBase, Device> { typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -535,12 +567,20 @@ struct TensorEvaluator, static const int NumOutputDims = NumInputDims - NumReducedDims; typedef typename internal::conditional, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator, Device> Self; + typedef TensorReductionEvaluatorBase, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::ReductionReturnType::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const Index PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; + enum { IsAligned = false, PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, @@ -562,11 +602,8 @@ struct TensorEvaluator, static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) -#if defined(EIGEN_USE_SYCL) - , m_xpr_dims(op.dims()) -#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -653,7 +690,7 @@ struct TensorEvaluator, // of which will eventually result in an NVCC error EIGEN_DEVICE_FUNC #endif - bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { + bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
@@ -663,7 +700,7 @@ struct TensorEvaluator, !RunningOnGPU))) { bool need_assign = false; if (!data) { - m_result = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType))); + m_result = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); data = m_result; need_assign = true; } @@ -671,20 +708,9 @@ struct TensorEvaluator, internal::FullReducer::run(*this, reducer, m_device, data); return need_assign; } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } // Attempt to use an optimized reduction. - else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -698,8 +724,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -707,6 +733,7 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { m_device.deallocate_temp(m_result); @@ -731,8 +758,8 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); m_result = data; } else { @@ -740,6 +767,7 @@ struct TensorEvaluator, } } Op reducer(m_reducer); + // For SYCL this if always return false if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { m_device.deallocate_temp(m_result); @@ -750,6 +778,21 @@ struct TensorEvaluator, return (m_result != NULL); } } + #if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. 
+ if(RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + #endif } return true; } @@ -764,7 +807,7 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { + if ((RunningFullReduction || RunningOnGPU) && m_result) { return *(m_result + index); } Op reducer(m_reducer); @@ -1097,12 +1140,15 @@ struct TensorEvaluator, m_device.deallocate(reducers); } - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - -#if defined(EIGEN_USE_SYCL) - const TensorEvaluator& impl() const { return m_impl; } - const Device& device() const { return m_device; } - const Dims& xprDims() const { return m_xpr_dims; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_result.bind(cgh); + } #endif private: @@ -1126,8 +1172,9 @@ struct TensorEvaluator, #endif #if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - template friend class TensorSycl::internal::FullReductionKernelFunctor; + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor; + // SYCL needs the generic reducer when the reduction algorithm is neither an inner, an outer, nor a full reduction + template friend struct internal::GenericReducer; #endif @@ -1255,9 +1302,6 @@ struct TensorEvaluator, // Precomputed strides for the output tensor. array m_outputStrides; array, NumOutputDims> m_fastOutputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; // Map from output to input dimension index.
array m_output_to_input_dim_map; @@ -1288,13 +1332,36 @@ static const bool RunningOnGPU = false; static const bool RunningOnGPU = false; static const bool RunningOnSycl = false; #endif - typename MakePointer_::Type m_result; + EvaluatorPointerType m_result; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; +}; -#if defined(EIGEN_USE_SYCL) - const Dims m_xpr_dims; -#endif +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +: public TensorReductionEvaluatorBase, Device> { + typedef TensorReductionEvaluatorBase, Device> Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} +}; + + +template class MakePointer_> +struct TensorEvaluator, Eigen::SyclDevice> +: public TensorReductionEvaluatorBase, Eigen::SyclDevice> { + + typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + // The coeff function in the base class is a recursive method that is not standard layout and cannot be used inside a SYCL kernel. + // Therefore the coeff function is overridden for the SYCL kernel + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + index); + } + // The packet function in the base class is a recursive method that is not standard layout and cannot be used inside a SYCL kernel. + // Therefore the packet function is overridden for the SYCL kernel + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload(this->data() + index); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6e15e75f9..b92c9ffaf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator::Dimensions Dimensions; typedef typename TensorEvaluator::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator EvalType; TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { m_dims = m_impl.dimensions(); @@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly Base; typedef typename Base::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { } @@ -362,6 +367,8 @@ struct TensorEvaluator, Device> typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -379,7 +386,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } @@ -394,7 +401,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC Scalar* data()
const { return m_ref.data(); } - + protected: TensorRef m_ref; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index b7fb969f3..3a699095c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -109,6 +109,8 @@ struct TensorEvaluator, Device typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -145,7 +147,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -158,6 +160,7 @@ struct TensorEvaluator, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { Index idx = index / m_strides[i]; index -= idx * m_strides[i]; @@ -172,6 +175,7 @@ struct TensorEvaluator, Device inputIndex += index; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { Index idx = index / m_strides[i]; index -= idx * m_strides[i]; @@ -205,6 +209,7 @@ struct TensorEvaluator, Device // local structure. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -225,12 +230,14 @@ struct TensorEvaluator, Device TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from sycl device - ReverseDimensions functor() const { return m_reverse; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: Dimensions m_dimensions; @@ -269,7 +276,7 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } @@ -285,11 +292,11 @@ struct TensorEvaluator, Device> // This code is pilfered from TensorMorphing.h EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; internal::pstore(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 64f10d0a4..44156126d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -86,12 +86,15 @@ struct TensorEvaluator, Device> { typedef TensorScanOp XprType; typedef typename XprType::Index Index; + typedef const ArgType 
ChildType; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename internal::remove_const::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef TensorEvaluator, Device> Self; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -110,7 +113,7 @@ struct TensorEvaluator, Device> { m_exclusive(op.exclusive()), m_accumulator(op.accumulator()), m_size(m_impl.dimensions()[op.axis()]), - m_stride(1), + m_stride(1), m_consume_dim(op.axis()), m_output(NULL) { // Accumulating a scalar isn't supported. @@ -142,6 +145,10 @@ struct TensorEvaluator, Device> { return m_stride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { + return m_consume_dim; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; } @@ -162,7 +169,7 @@ struct TensorEvaluator, Device> { return m_device; } - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); ScanLauncher launcher; if (data) { @@ -171,7 +178,7 @@ struct TensorEvaluator, Device> { } const Index total_size = internal::array_prod(dimensions()); - m_output = static_cast(m_device.allocate(total_size * sizeof(Scalar))); + m_output = static_cast(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar)))); launcher(*this, m_output); return true; } @@ -181,7 +188,7 @@ struct TensorEvaluator, Device> { return internal::ploadt(m_output + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; } @@ -196,21 +203,29 @@ struct TensorEvaluator, Device> { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_output != NULL) { - m_device.deallocate(m_output); + if (m_output) { + m_device.deallocate_temp(m_output); m_output = NULL; } m_impl.cleanup(); } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_output.bind(cgh); + } +#endif protected: TensorEvaluator m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; const bool m_exclusive; Op m_accumulator; const Index m_size; Index m_stride; - CoeffReturnType* m_output; + Index m_consume_dim; + EvaluatorPointerType m_output; }; // CPU implementation of scan diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 416948765..ae04785ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -109,6 +109,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -130,8 +132,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), - m_impl(op.expression(), device), - m_shuffle(op.shufflePermutation()) + m_impl(op.expression(), device) { const typename 
TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -172,7 +173,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -194,6 +195,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = self.coeff(index + i); } @@ -210,6 +212,7 @@ struct TensorEvaluator, Device> return self.m_impl.template packet(index); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = self.coeff(index + i); } @@ -330,13 +333,14 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } - - // required by sycl - EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} - // required by sycl - EIGEN_STRONG_INLINE const TensorEvaluator& impl() const {return m_impl;} + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( Index input_index, @@ -389,10 +393,8 @@ struct TensorEvaluator, Device> array m_inputStrides; array m_unshuffledInputStrides; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; TensorEvaluator m_impl; - /// required by sycl - Shuffle m_shuffle; }; @@ -444,6 +446,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 221dc96c9..3b1cbaabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -37,7 +37,7 @@ struct traits > : public traits template struct eval, Eigen::Dense> { - typedef const TensorStridingOp& type; + typedef const TensorStridingOpEIGEN_DEVICE_REF type; }; template @@ -108,6 +108,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/false, @@ -120,7 +122,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) + : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { @@ -149,9 
+151,10 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -173,6 +176,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + PacketSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -184,6 +188,7 @@ struct TensorEvaluator, Device> inputIndices[0] += indices[0] * m_inputStrides[0]; inputIndices[1] += indices[1] * m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -203,6 +208,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -225,18 +231,20 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - Strides functor() const { return m_strides; } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -244,6 +252,7 @@ struct TensorEvaluator, Device> } inputIndex += index * m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -258,7 +267,6 @@ struct TensorEvaluator, Device> array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Strides m_strides; }; // Eval as lvalue @@ -296,11 +304,6 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return this->m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -310,6 +313,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + PacketSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { + 
EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; @@ -321,6 +325,7 @@ struct TensorEvaluator, Device> inputIndices[0] += indices[0] * this->m_inputStrides[0]; inputIndices[1] += indices[1] * this->m_inputStrides[0]; } else { // RowMajor + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; @@ -340,6 +345,7 @@ struct TensorEvaluator, Device> internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { this->coeffRef(index+i) = values[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 9fc54a4c0..d04b1bea7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -91,6 +91,8 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -205,7 +207,7 @@ struct TensorEvaluator, Device> return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -249,6 +251,13 @@ struct TensorEvaluator, Device> return result; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: // Given the output index, finds the first index in the input tensor used to compute the trace EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -276,7 +285,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; // Initialize the size of the trace dimension Index m_traceDim; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; array m_reduced; array m_reducedDims; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 0a394c88d..4aec83cff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -58,9 +58,6 @@ struct traits > }; template struct MakePointer { typedef T* Type; - typedef T& RefType; - typedef T ScalarType; - }; typedef typename MakePointer::Type PointerType; }; @@ -80,9 +77,6 @@ struct traits > }; template struct MakePointer { typedef T* Type; - typedef T& RefType; - typedef T ScalarType; - }; typedef typename MakePointer::Type PointerType; }; @@ -106,10 +100,6 @@ struct traits > // Intermediate typedef to workaround MSVC issue. 
typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - typedef typename MakePointerT::ScalarType ScalarType; - - }; typedef typename MakePointer::Type PointerType; }; @@ -135,49 +125,49 @@ struct traits > template struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type; }; template struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type; }; template struct eval, Eigen::Dense> { - typedef const TensorFixedSize& type; + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; }; template struct eval, Eigen::Dense> { - typedef const TensorFixedSize& type; + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; }; template class MakePointer> struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMapEIGEN_DEVICE_REF type; }; template class MakePointer> struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMapEIGEN_DEVICE_REF type; }; template struct eval, Eigen::Dense> { - typedef const TensorRef& type; + typedef const TensorRefEIGEN_DEVICE_REF type; }; template struct eval, Eigen::Dense> { - typedef const TensorRef& type; + typedef const TensorRefEIGEN_DEVICE_REF type; }; // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. @@ -189,50 +179,50 @@ template struct nested template struct nested > { - typedef const Tensor& type; + typedef const TensorEIGEN_DEVICE_REF type; }; template struct nested > { - typedef const Tensor& type; + typedef const TensorEIGEN_DEVICE_REF type; }; template struct nested > { - typedef const TensorFixedSize& type; + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; }; template struct nested > { - typedef const TensorFixedSize& type; + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; }; template class MakePointer> struct nested > { - typedef const TensorMap& type; + typedef const TensorMapEIGEN_DEVICE_REF type; }; template class MakePointer> struct nested > { - typedef const TensorMap& type; + typedef const TensorMapEIGEN_DEVICE_REF type; }; template struct nested > { - typedef const TensorRef& type; + typedef const TensorRefEIGEN_DEVICE_REF type; }; template struct nested > { - typedef const TensorRef& type; + typedef const TensorRefEIGEN_DEVICE_REF type; }; } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index c1b7a58ca..29a2d5538 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -91,24 +91,6 @@ class TensorVolumePatchOp : public TensorBase, D typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, @@ -205,16 +189,9 @@ struct TensorEvaluator, D CoordAccess = false, RawAccess = false }; -#ifdef __SYCL_DEVICE_ONLY__ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) -#else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) -#endif - : 
m_impl(op.expression(), device) -#ifdef EIGEN_USE_SYCL - , m_op(op) -#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : + m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -368,9 +345,10 @@ struct TensorEvaluator, D m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -531,14 +509,10 @@ struct TensorEvaluator, D return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } -#ifdef EIGEN_USE_SYCL - // Required by SYCL in order to construct the expression on the device - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } -#endif Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -556,10 +530,17 @@ struct TensorEvaluator, D Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -635,10 +616,6 @@ struct TensorEvaluator, D TensorEvaluator m_impl; -#ifdef EIGEN_USE_SYCL -// Required by SYCL in order to construct the expression on the device - XprType m_op; -#endif }; diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 162dab7b8..8ea03382d 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -527,26 +527,32 @@ static void test_execute_generator_op(Device d) } } +#ifdef EIGEN_DONT_VECTORIZE +#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL +#else +#define VECTORIZABLE(VAL) VAL +#endif + #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); 
\ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))) + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))) EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 4cbe15b63..eb708737d 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -51,7 +51,8 @@ static void test_static_reshape() { // New dimensions: [2, 3, 7] Eigen::IndexList, type2index<3>, type2index<7>> dim; - Tensor reshaped = tensor.reshape(dim); + Tensor reshaped = tensor.reshape(static_cast>(dim)); + for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { -- cgit v1.2.3
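For reference, below is a minimal host-side sketch (not part of the patch) of how the reworked SYCL reduction path is driven. It follows the usage pattern of the existing SYCL tensor tests; the Eigen::QueueInterface / Eigen::SyclDevice setup, the tensor sizes, and the reduced dimension are illustrative assumptions rather than code taken from this change.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Build a SYCL device from a queue interface, as the SYCL tensor tests do.
  cl::sycl::default_selector selector;
  Eigen::QueueInterface queue_interface(selector);
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 2> in(64, 128);
  Eigen::Tensor<float, 1> out(64);
  in.setRandom();

  // Device memory is obtained through the SyclDevice virtual-pointer allocator.
  float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in_gpu(gpu_in, 64, 128);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out_gpu(gpu_out, 64);

  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));

  // Summing over one dimension goes through the reduction evaluator; the
  // evaluator's bind() is invoked internally when the kernel is enqueued.
  Eigen::array<Eigen::Index, 1> reduce_dim = {{1}};
  out_gpu.device(sycl_device) = in_gpu.sum(reduce_dim);

  sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
  return 0;
}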