diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-06-28 10:08:23 +0100 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-06-28 10:08:23 +0100 |
commit | 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde (patch) | |
tree | fbff4d80b6b373dcd53632de4c1fab5c393bdd64 /unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | |
parent | 16a56b2dddbfaf2d4b81d62be5e3139f12783ac8 (diff) |
[SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL.
* Abstracting the pointer type so that both SYCL memory objects and raw pointers can be captured.
* Converting SYCL virtual pointer to SYCL device memory in Eigen evaluator class.
* Binding SYCL placeholder accessor to command group handler by using bind method in Eigen evaluator node.
* Adding SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to accommodate the above changes.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 94 |
1 file changed, 55 insertions(+), 39 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 5352c8f7b..8f6e987b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -37,7 +37,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp template<typename NewDimensions, typename XprType> struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> { - typedef const TensorReshapingOp<NewDimensions, XprType>& type; + typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type; }; template<typename NewDimensions, typename XprType> @@ -106,6 +106,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; static const int NumOutputDims = internal::array_size<Dimensions>::value; static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; @@ -168,7 +171,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -326,10 +329,18 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> } } - EIGEN_DEVICE_FUNC typename 
Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif protected: TensorEvaluator<ArgType, Device> m_impl; NewDimensions m_dimensions; @@ -404,7 +415,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp template<typename StartIndices, typename Sizes, typename XprType> struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> { - typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type; }; template<typename StartIndices, typename Sizes, typename XprType> @@ -488,7 +499,7 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> { // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. 
#ifdef EIGEN_USE_SYCL -template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> { +template <typename Index> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice> { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; @@ -508,6 +519,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef Sizes Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -575,7 +589,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data() @@ -599,10 +613,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi // Use memcpy if it's going to be faster than using the regular evaluation. 
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device); if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); } return false; } @@ -637,6 +651,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -648,6 +663,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -667,6 +683,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -698,8 +715,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi m_impl.block(&input_block); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { - Scalar* result = 
const_cast<Scalar*>(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); if (result) { Index offset = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -733,19 +750,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{ - return m_impl; - } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{ - return m_offsets; +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); } +#endif + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -753,6 +770,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } inputIndex += (index + m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -767,7 +785,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; @@ -829,6 +847,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Index 
inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -840,6 +859,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -859,6 +879,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> internal::pstore<CoeffReturnType, PacketReturnType>(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } @@ -892,7 +913,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense> { - typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type; + typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type; }; template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> @@ -969,6 +990,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef StorageMemory<CoeffReturnType, 
Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; typedef Strides Dimensions; enum { @@ -985,8 +1008,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), - m_strides(op.strides()), m_exprStartIndices(op.startIndices()), - m_exprStopIndices(op.stopIndices()) + m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped; @@ -1069,7 +1091,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -1091,30 +1113,28 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 
1 : NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } - - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;} - +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { + EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -1125,7 +1145,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { -#ifndef __SYCL_DEVICE_ONLY__ +#ifndef SYCL_DEVICE_ONLY return numext::maxi(min, numext::mini(max,value)); #else return cl::sycl::clamp(value, min, max); @@ -1137,15 +1157,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, array<Index, NumDims> m_inputStrides; bool m_is_identity; 
TensorEvaluator<ArgType, Device> m_impl; - const Device& m_device; + const Device EIGEN_DEVICE_REF m_device; DSizes<Index, NumDims> m_startIndices; // clamped startIndices DSizes<Index, NumDims> m_dimensions; DSizes<Index, NumDims> m_offsets; // offset in a flattened shape const Strides m_strides; - //use by sycl - const StartIndices m_exprStartIndices; - //use by sycl - const StopIndices m_exprStopIndices; }; // Eval as lvalue |