From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 28 Nov 2019 10:08:54 +0000 Subject: [SYCL] Rebasing the SYCL support branch on top of the Einge upstream master branch. * Unifying all loadLocalTile from lhs and rhs to an extract_block function. * Adding get_tensor operation which was missing in TensorContractionMapper. * Adding the -D method missing from cmake for Disable_Skinny Contraction operation. * Wrapping all the indices in TensorScanSycl into Scan parameter struct. * Fixing typo in Device SYCL * Unifying load to private register for tall/skinny no shared * Unifying load to vector tile for tensor-vector/vector-tensor operation * Removing all the LHS/RHS class for extracting data from global * Removing Outputfunction from TensorContractionSkinnyNoshared. * Combining the local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining General Tensor-Vector and VectorTensor contraction into one kernel. * Making double buffering optional for Tensor contraction when local memory is version is used. * Modifying benchmark to accept custom Reduction Sizes * Disabling AVX optimization for SYCL backend on the host to allow SSE optimization to the host * Adding Test for SYCL * Modifying SYCL CMake --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 150 +++++++-------------- 1 file changed, 47 insertions(+), 103 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 9926046b9..b83174ab7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor -struct ExecExprFunctorKernel_impl { +template +struct ExecExprFunctorKernel { typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { Index gId = static_cast(itemID.get_global_linear_id()); Index total_threads = itemID.get_global_range(0); - EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { evaluator.evalScalar(i); } } -}; - -template -struct ExecExprFunctorKernel_impl { - typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; - Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; Index gId = static_cast(itemID.get_global_linear_id()); - if (gId < vectorizable_threads) { - const Index PacketSize = Eigen::internal::unpacket_traits< - typename Evaluator::PacketReturnType>::size; - evaluator.evalPacket(gId * PacketSize); - gId += (vectorizable_threads * PacketSize); - EIGEN_UNROLL_LOOP - for (Index i = gId; i < range; i += vectorizable_threads) { - evaluator.evalScalar(i); - } + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); } } }; -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator> { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator>(range_, vectorizable_threads_, evaluator) {} -}; - -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl( - range_, vectorizable_threads_, evaluator) {} -}; - template class TensorExecutor { - public: + public: typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { - Eigen::TensorEvaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); total_size = (total_size == 0) ? 1 : total_size; - const int PacketSize = Eigen::PacketType< - typename Eigen::TensorEvaluator::CoeffReturnType, - Eigen::SyclDevice>::size; - Index vectorizable_threads = - static_cast(total_size / PacketSize); + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); range = total_size; - auto f = [&](cl::sycl::handler &cgh) { - evaluator.bind(cgh); - typedef ExecExprFunctorKernel> - conditional_vectorized_kernel; - - typedef ExecExprFunctorKernel> - non_vectorized_kernel; -// This is to make sure that an expression with a size less than vectorized size -// will not call the vectorized kernel. -// The reason for having this kernel is that the vectorisable parameter is a -// compile-time parameter, -// however, the size of a tensor is a run-time parameter - (vectorizable_threads) - ? cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - conditional_vectorized_kernel(range, vectorizable_threads, - evaluator)) - : cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - non_vectorized_kernel(range, vectorizable_threads, - evaluator)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); - dev.async_synchronize(e); + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); } evaluator.cleanup(); } -- cgit v1.2.3