From 7d08fa805a38f9ebb9e0e487c4e2d23d32a0fcde Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 28 Jun 2019 10:08:23 +0100 Subject: [SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL. * Abstracting the pointer type so that both SYCL memory and pointer can be captured. * Converting SYCL virtual pointer to SYCL device memory in Eigen evaluator class. * Binding SYCL placeholder accessor to command group handler by using bind method in Eigen evaluator node. * Adding SYCL macro for controlling loop unrolling. * Modifying the TensorDeviceSycl.h and SYCL executor method to adopt the above changes. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 133 ++++++++++++++++++++- 1 file changed, 127 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7b5842571..47e9b24ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -442,12 +442,133 @@ EIGEN_STRONG_INLINE void TensorExecutor -class TensorExecutor { -public: - static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); +template +struct ExecExprFunctorKernel_impl { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } +}; + +template +struct ExecExprFunctorKernel_impl { + typedef typename Evaluator::Index Index; + const Index range; + const Index vectorizable_threads; + Evaluator evaluator; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( + const Index range_, const Index vectorizable_threads_, + Evaluator evaluator_) + : range(range_), vectorizable_threads(vectorizable_threads_), + evaluator(evaluator_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void + operator()(cl::sycl::nd_item<1> itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + if (gId < vectorizable_threads) { + const Index PacketSize = Eigen::internal::unpacket_traits< + typename Evaluator::PacketReturnType>::size; + evaluator.evalPacket(gId * PacketSize); + gId += (vectorizable_threads * PacketSize); + EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += vectorizable_threads) { + evaluator.evalScalar(i); + } + } + } +}; + +template +struct ExecExprFunctorKernel + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable::value, + Evaluator> { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl< + ::Eigen::internal::IsVectorizable::value, + Evaluator>(range_, vectorizable_threads_, evaluator) {} +}; + +template +struct ExecExprFunctorKernel + : ExecExprFunctorKernel_impl { + ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, + const Evaluator &evaluator) + : ExecExprFunctorKernel_impl( + range_, vectorizable_threads_, evaluator) {} +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { + Eigen::TensorEvaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType< + typename Eigen::TensorEvaluator::CoeffReturnType, + Eigen::SyclDevice>::size; + Index vectorizable_threads = + static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + auto f = [&](cl::sycl::handler &cgh) { + evaluator.bind(cgh); + typedef ExecExprFunctorKernel> + conditional_vectorized_kernel; + + typedef ExecExprFunctorKernel> + non_vectorized_kernel; +// This is to make sure that an expression with a size less than vectorized size +// will not call the vectorized kernel. +// The reason for having this kernel is that the vectorisable parameter is a +// compile-time parameter, +// however, the size of a tensor is a run-time parameter + (vectorizable_threads) + ? cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + conditional_vectorized_kernel(range, vectorizable_threads, + evaluator)) + : cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + dev.program().template get_kernel(), +#endif + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + non_vectorized_kernel(range, vectorizable_threads, + evaluator)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); + dev.async_synchronize(e); + } + evaluator.cleanup(); } }; -- cgit v1.2.3