From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 28 Jun 2019 10:08:23 +0100
Subject: [SYCL] This PR adds the minimum modifications to the Eigen
 unsupported module required to run it on devices supporting SYCL.

* Abstracting the pointer type so that both SYCL memory and raw
  pointers can be captured.
* Converting the SYCL virtual pointer to SYCL device memory in the
  Eigen evaluator class.
* Binding SYCL placeholder accessors to the command group handler via
  the bind method on each Eigen evaluator node.
* Adding a SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to adopt
  the above changes.
---
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 94 +++++++++++++---------
 1 file changed, 55 insertions(+), 39 deletions(-)
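The pointer abstraction in the first bullet can be pictured with a small
stand-alone sketch. The type names below (HostDevice, SyclLikeDevice,
DeviceMemoryRange) are illustrative stand-ins rather than the actual Eigen
internals; only the StorageMemory/EvaluatorPointerType pattern mirrors the
patch. An evaluator names its data pointer through a device-parameterised
trait, so a host build keeps a plain Scalar* while a SYCL build can carry
device memory that is not dereferenceable on the host:

    #include <cstddef>

    struct HostDevice {};      // stand-in for a CPU device
    struct SyclLikeDevice {};  // stand-in for Eigen::SyclDevice

    // Stand-in for SYCL device memory: a buffer handle plus an offset.
    // In the real backend this role is played by a placeholder accessor.
    template <typename T>
    struct DeviceMemoryRange {
      T* base;
      std::ptrdiff_t offset;
      DeviceMemoryRange operator+(std::ptrdiff_t i) const {
        return DeviceMemoryRange{base, offset + i};
      }
      T& operator[](std::ptrdiff_t i) const { return base[offset + i]; }
    };

    // Generic case: the evaluator pointer is a raw pointer.
    template <typename T, typename Device>
    struct StorageMemory { typedef T* Type; };

    // SYCL case: the evaluator pointer is the device-memory wrapper.
    template <typename T>
    struct StorageMemory<T, SyclLikeDevice> {
      typedef DeviceMemoryRange<T> Type;
    };

    // One evaluator body now serves both devices.
    template <typename Scalar, typename Device>
    struct Evaluator {
      typedef typename StorageMemory<Scalar, Device>::Type EvaluatorPointerType;
      EvaluatorPointerType m_data;
      Scalar coeff(std::ptrdiff_t i) const { return m_data[i]; }
    };

This is why evalSubExprsIfNeeded and data() in the hunks below trade
CoeffReturnType* and traits<XprType>::PointerType for EvaluatorPointerType
and Storage::Type.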
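The bind mechanism in the third bullet is a recursive walk over the
evaluator expression tree: each node forwards the cl::sycl::handler to its
children until the leaves register their placeholder accessors with the
command group (SYCL 1.2.1 attaches a placeholder accessor with
handler::require). A minimal device-free sketch of the recursion, with the
handler mocked out:

    struct MockHandler {};  // stands in for cl::sycl::handler

    template <typename T>
    struct LeafEvaluator {
      // A real leaf holds a placeholder accessor and would call
      // cgh.require(m_accessor) here.
      void bind(MockHandler& /*cgh*/) const {}
    };

    template <typename LhsEval, typename RhsEval>
    struct BinaryEvaluator {
      LhsEval m_left;
      RhsEval m_right;
      // Interior nodes only forward the handler down the tree, exactly
      // like the m_impl.bind(cgh) bodies added in this patch.
      void bind(MockHandler& cgh) const {
        m_left.bind(cgh);
        m_right.bind(cgh);
      }
    };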
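EIGEN_UNROLL_LOOP, inserted before the hot index-computation loops below,
is an unrolling hint. Its real definition lives in Eigen's Macros.h; a
typical definition of such a macro (shown here as an assumption, not the
verbatim Eigen one) degrades to a no-op on compilers without an unroll
pragma:

    #if defined(__clang__)
      #define EIGEN_UNROLL_LOOP _Pragma("unroll")
    #elif defined(__GNUC__) && (__GNUC__ >= 8)
      #define EIGEN_UNROLL_LOOP _Pragma("GCC unroll 8")
    #else
      #define EIGEN_UNROLL_LOOP  // no-op where unsupported
    #endif

    // Usage mirrors the patch: the macro sits immediately before the loop.
    void scale(float* v, int n, float f) {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < n; ++i) v[i] *= f;
    }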
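The memcpy change in the slicing evaluator follows from the same
abstraction: index arithmetic stays on the abstract pointer type, and only
the operands handed to the device copy are resolved to raw addresses
through the device's get(). A sketch with a hypothetical host-style device,
where get() is the identity:

    #include <cstddef>
    #include <cstring>

    struct HostLikeDevice {
      // On a host device, resolving an evaluator pointer is a no-op; the
      // SYCL device instead maps its virtual pointer to real device memory.
      template <typename T> T* get(T* p) const { return p; }
      void memcpy(void* dst, const void* src, std::size_t n) const {
        std::memcpy(dst, src, n);
      }
    };

    template <typename Device, typename Ptr>
    void copy_block(const Device& d, Ptr dst, Ptr src, std::size_t bytes) {
      // Mirrors m_device.memcpy(m_device.get(data + i),
      //                         m_device.get(src + offset), ...)
      d.memcpy((void*)d.get(dst), (const void*)d.get(src), bytes);
    }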
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 5352c8f7b..8f6e987b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -37,7 +37,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
 template<typename NewDimensions, typename XprType>
 struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
 {
-  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+  typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename NewDimensions, typename XprType>
@@ -106,6 +106,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
 
   static const int NumOutputDims = internal::array_size<Dimensions>::value;
   static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -168,7 +171,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -326,10 +329,18 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+   m_impl.bind(cgh);
+ }
+ #endif
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
@@ -404,7 +415,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
 template<typename StartIndices, typename Sizes, typename XprType>
 struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
 {
-  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
@@ -488,7 +499,7 @@ template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
 // It is very expensive to start the memcpy kernel on GPU: we therefore only
 // use it for large copies.
 #ifdef EIGEN_USE_SYCL
-template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice>  {
+template <typename Index> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice>  {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
   EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
 };
@@ -508,6 +519,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Sizes Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -575,7 +589,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
         && data && m_impl.data()
@@ -599,10 +613,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       // Use memcpy if it's going to be faster than using the regular evaluation.
       const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
       if (trigger(contiguous_values)) {
-        Scalar* src = (Scalar*)m_impl.data();
+        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
         for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
           Index offset = srcCoeff(i);
-          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
         }
         return false;
       }
@@ -637,6 +651,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -648,6 +663,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_fastOutputStrides[i];
        const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -667,6 +683,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -698,8 +715,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     m_impl.block(&input_block);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
-    Scalar* result = const_cast<Scalar*>(m_impl.data());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -733,19 +750,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
-    return m_impl;
-  }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
-    return m_offsets;
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
+#endif
+
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -753,6 +770,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
       inputIndex += (index + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -767,7 +785,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   bool m_is_identity;
   const StartIndices m_offsets;
@@ -829,6 +847,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -840,6 +859,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
        const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -859,6 +879,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
@@ -892,7 +913,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
@@ -969,6 +990,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   typedef Strides Dimensions;
 
   enum {
@@ -985,8 +1008,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()),
-        m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices())
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
     DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
@@ -1069,7 +1091,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     return m_impl.coeff(srcCoeff(index));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     return NULL;
   }
-
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& strides() const { return m_strides; }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -1125,7 +1145,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   array<Index, NumDims> m_inputStrides;
   bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  //use by sycl
-  const StartIndices m_exprStartIndices;
-  //use by sycl
-  const StopIndices m_exprStopIndices;
 };
 
 // Eval as lvalue