aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
diff options
context:
space:
mode:
authorGravatar Mehdi Goli <mehdi.goli@codeplay.com>2019-11-28 10:08:54 +0000
committerGravatar Mehdi Goli <mehdi.goli@codeplay.com>2019-11-28 10:08:54 +0000
commit00f32752f7d0b193c6788691c3cf0b76457a044d (patch)
tree792e46110f0751ea8802fa9d403d1472d5977ac3 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parentea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff)
[SYCL] Rebasing the SYCL support branch on top of the Einge upstream master branch.
* Unifying all loadLocalTile from lhs and rhs to an extract_block function. * Adding get_tensor operation which was missing in TensorContractionMapper. * Adding the -D method missing from cmake for Disable_Skinny Contraction operation. * Wrapping all the indices in TensorScanSycl into Scan parameter struct. * Fixing typo in Device SYCL * Unifying load to private register for tall/skinny no shared * Unifying load to vector tile for tensor-vector/vector-tensor operation * Removing all the LHS/RHS class for extracting data from global * Removing Outputfunction from TensorContractionSkinnyNoshared. * Combining the local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining General Tensor-Vector and VectorTensor contraction into one kernel. * Making double buffering optional for Tensor contraction when local memory is version is used. * Modifying benchmark to accept custom Reduction Sizes * Disabling AVX optimization for SYCL backend on the host to allow SSE optimization to the host * Adding Test for SYCL * Modifying SYCL CMake
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h150
1 files changed, 47 insertions, 103 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 9926046b9..b83174ab7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
-template <bool Vectorizable, typename Evaluator>
-struct ExecExprFunctorKernel_impl {
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
typedef typename Evaluator::Index Index;
- const Index range;
- const Index vectorizable_threads;
Evaluator evaluator;
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
- const Index range_, const Index vectorizable_threads_,
- Evaluator evaluator_)
- : range(range_), vectorizable_threads(vectorizable_threads_),
- evaluator(evaluator_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
- operator()(cl::sycl::nd_item<1> itemID) {
+ const Index range;
+ template <typename Scratch>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+ const Scratch, Evaluator evaluator_, const Index range_)
+ : evaluator(evaluator_), range(range_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+ cl::sycl::nd_item<1> itemID) {
+ compute(itemID);
+ }
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
- EIGEN_UNROLL_LOOP
+
for (Index i = gId; i < range; i += total_threads) {
evaluator.evalScalar(i);
}
}
-};
-
-template <typename Evaluator>
-struct ExecExprFunctorKernel_impl<true, Evaluator> {
- typedef typename Evaluator::Index Index;
- const Index range;
- const Index vectorizable_threads;
- Evaluator evaluator;
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
- const Index range_, const Index vectorizable_threads_,
- Evaluator evaluator_)
- : range(range_), vectorizable_threads(vectorizable_threads_),
- evaluator(evaluator_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
- operator()(cl::sycl::nd_item<1> itemID) {
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
+ const Index vectorizedRange =
+ (range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
- if (gId < vectorizable_threads) {
- const Index PacketSize = Eigen::internal::unpacket_traits<
- typename Evaluator::PacketReturnType>::size;
- evaluator.evalPacket(gId * PacketSize);
- gId += (vectorizable_threads * PacketSize);
- EIGEN_UNROLL_LOOP
- for (Index i = gId; i < range; i += vectorizable_threads) {
- evaluator.evalScalar(i);
- }
+ const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ const Index start = Evaluator::PacketSize * gId;
+ for (Index i = start; i < vectorizedRange; i += step) {
+ evaluator.evalPacket(i);
+ }
+ gId += vectorizedRange;
+ for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+ evaluator.evalScalar(i);
}
}
};
-template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator>
-struct ExecExprFunctorKernel
- : ExecExprFunctorKernel_impl<
- ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
- Evaluator> {
- ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
- const Evaluator &evaluator)
- : ExecExprFunctorKernel_impl<
- ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
- Evaluator>(range_, vectorizable_threads_, evaluator) {}
-};
-
-template <typename Expr, typename Evaluator>
-struct ExecExprFunctorKernel<Expr, false, Evaluator>
- : ExecExprFunctorKernel_impl<false, Evaluator> {
- ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
- const Evaluator &evaluator)
- : ExecExprFunctorKernel_impl<false, Evaluator>(
- range_, vectorizable_threads_, evaluator) {}
-};
-
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
- public:
+ public:
typedef typename Expression::Index Index;
- static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const Eigen::SyclDevice& dev) {
+ typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+ Evaluator evaluator(expr, dev);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
Index range, GRange, tileSize;
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
total_size = (total_size == 0) ? 1 : total_size;
- const int PacketSize = Eigen::PacketType<
- typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType,
- Eigen::SyclDevice>::size;
- Index vectorizable_threads =
- static_cast<Index>(total_size / PacketSize);
+ const int PacketSize =
+ Eigen::PacketType<typename Evaluator::CoeffReturnType,
+ Eigen::SyclDevice>::size;
+ Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
range = total_size;
- auto f = [&](cl::sycl::handler &cgh) {
- evaluator.bind(cgh);
- typedef ExecExprFunctorKernel<Expression, true,
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
- conditional_vectorized_kernel;
-
- typedef ExecExprFunctorKernel<Expression, false,
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
- non_vectorized_kernel;
-// This is to make sure that an expression with a size less than vectorized size
-// will not call the vectorized kernel.
-// The reason for having this kernel is that the vectorisable parameter is a
-// compile-time parameter,
-// however, the size of a tensor is a run-time parameter
- (vectorizable_threads)
- ? cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- dev.program().template get_kernel<vectorized_kernel>(),
-#endif
- cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
- cl::sycl::range<1>(tileSize)),
- conditional_vectorized_kernel(range, vectorizable_threads,
- evaluator))
- : cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- dev.program().template get_kernel<non_vectorized_kernel>(),
-#endif
- cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
- cl::sycl::range<1>(tileSize)),
- non_vectorized_kernel(range, vectorizable_threads,
- evaluator));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f));
- dev.async_synchronize(e);
+
+ dev.template nullary_kernel_launcher<
+ typename Evaluator::CoeffReturnType,
+ ExecExprFunctorKernel<Evaluator> >(
+ evaluator,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+ cl::sycl::range<1>(tileSize)),
+ Index(1), range);
}
evaluator.cleanup();
}