author    Mehdi Goli <mehdi.goli@codeplay.com> 2019-11-28 10:08:54 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com> 2019-11-28 10:08:54 +0000
commit    00f32752f7d0b193c6788691c3cf0b76457a044d (patch)
tree      792e46110f0751ea8802fa9d403d1472d5977ac3 /unsupported/Eigen
parent    ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff)
[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.
* Unifying all loadLocalTile from lhs and rhs into an extract_block function.
* Adding the get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D method missing from cmake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying load to private register for tall/skinny no-shared.
* Unifying load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing the Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and Vector-Tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake.
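A minimal usage sketch of the contraction path exercised by this patch, modelled on the in-tree SYCL tests mentioned in the commit message ("Adding tests for SYCL"). The device selection, sizes and buffer handling below are illustrative assumptions, not part of the commit:

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    void contract_on_sycl() {
      // Wrap the first supported SYCL device in Eigen's SYCL device abstraction.
      auto devices = Eigen::get_sycl_supported_devices();
      Eigen::QueueInterface queue_interface(devices[0]);
      Eigen::SyclDevice sycl_device(&queue_interface);

      // Contract a 128x64 LHS with a 64x32 RHS over the shared dimension.
      Eigen::Tensor<float, 2> lhs(128, 64), rhs(64, 32), out(128, 32);
      lhs.setRandom(); rhs.setRandom();

      float* d_lhs = static_cast<float*>(sycl_device.allocate(lhs.size() * sizeof(float)));
      float* d_rhs = static_cast<float*>(sycl_device.allocate(rhs.size() * sizeof(float)));
      float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
      sycl_device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float));
      sycl_device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float));

      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_lhs(d_lhs, 128, 64);
      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_rhs(d_rhs, 64, 32);
      Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_out(d_out, 128, 32);
      Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
      // Dispatches the TensorContractionSycl kernels reworked in this patch.
      gpu_out.device(sycl_device) = gpu_lhs.contract(gpu_rhs, dims);
      sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));

      sycl_device.deallocate(d_lhs);
      sycl_device.deallocate(d_rhs);
      sycl_device.deallocate(d_out);
    }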
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor  23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h  152
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h  4
-rwxr-xr-x [-rw-r--r--]  unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h  1890
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h  689
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h  245
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  150
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h  673
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h  512
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h  120
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h  205
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h  514
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h  310
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h  467
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h  248
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h  213
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h  302
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h  96
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h  239
22 files changed, 3229 insertions, 3831 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 6a8dc2cd8..f8a62253c 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,19 +15,6 @@
#if EIGEN_HAS_CXX11
-#if defined(EIGEN_USE_SYCL)
-#undef min
-#undef max
-#undef isnan
-#undef isinf
-#undef isfinite
-#include <CL/sycl.hpp>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <utility>
-#endif
-
#include "../SpecialFunctions"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
@@ -72,7 +59,7 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
-#ifdef EIGEN_USE_THREADS
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#endif
@@ -147,7 +134,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorTrace.h"
-#include "src/Tensor/TensorSycl.h"
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
+
#include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
deleted file mode 100644
index 2184c94b3..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
+++ /dev/null
@@ -1,152 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorArgMaxSycl.h
- * \brief:
- * TensorArgMaxSycl
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
-namespace Eigen {
-namespace internal {
- template<typename Dims, typename XprType>
- struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
- {
- typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
- };
-
- template<typename Dims, typename XprType>
- struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
- typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
- {
- typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
- };
-
-template<typename StrideDims, typename XprType>
-struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
-{
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef Index Scalar;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
-};
-
-
-}// end namespace internal
-template<typename StrideDims, typename XprType>
-class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
- typedef typename XprType::CoeffReturnType TupleType;
- typedef Index CoeffReturnType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
- const Index return_dim,
- const StrideDims strides,
- const Index stride_mod, const Index stride_div)
- :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_DEVICE_FUNC
- Index return_dim() const { return m_return_dim; }
-
- EIGEN_DEVICE_FUNC
- const StrideDims& strides() const { return m_strides; }
-
- EIGEN_DEVICE_FUNC
- const Index& stride_mod() const { return m_stride_mod; }
-
- EIGEN_DEVICE_FUNC
- const Index& stride_div() const { return m_stride_div; }
-
- protected:
- typename Eigen::internal::remove_all<typename
- XprType::Nested
- >::type m_xpr;
- const Index m_return_dim;
- const StrideDims m_strides;
- const Index m_stride_mod;
- const Index m_stride_div;
-};
-
-
-// Eval as rvalue
-template<typename StrideDims, typename ArgType>
-struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
-{
- typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename XprType::TupleType TupleType;
- typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
-
- enum {
- IsAligned = false,
- PacketAccess = false,
- BlockAccessV2 = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
- : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
- m_stride_div(op.stride_div()){}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_impl.dimensions();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- const TupleType v = m_impl.coeff(index);
- return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
- }
-typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
-
-protected:
- TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
- const Index m_return_dim;
- const StrideDims m_strides;
- const Index m_stride_mod;
- const Index m_stride_div;
-};
-} // end namespace Eigen
-#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
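For reference, the coeff() of the evaluator removed above converts the flat index of the winning element into an index along return_dim via (v.first % m_stride_mod) / m_stride_div. A small worked example, with sizes chosen purely for illustration and stride values following the definitions in TensorArgMax.h:

    #include <cassert>
    int main() {
      // Column-major 4x5 tensor, return_dim = 1 (the size-5 dimension).
      const int stride_div = 4;      // stride of dimension 1
      const int stride_mod = 4 * 5;  // stride of the "next" dimension (here: the total size)
      const int flat_argmax = 13;    // row 1, column 3 in column-major order
      assert((flat_argmax % stride_mod) / stride_div == 3);  // recovered column index
      return 0;
    }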
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 50865d404..9ab900b4a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -545,6 +545,10 @@ class TensorContractionInputMapper
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+ return Base::m_tensor;
+ }
};
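The new accessor exposes the underlying CoeffLoader so the SYCL block loaders can issue packet loads straight from the mapped tensor. The read() helper added later in this patch uses it roughly as follows (a simplified excerpt of the code further down, not a separate API):

    // Inside TensorSycl::internal::read(...), defined later in this patch:
    const StorageIndex row = is_coalesced_layout ? NCIndex : CIndex;
    const StorageIndex col = is_coalesced_layout ? CIndex : NCIndex;
    return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));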
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index 35f931c53..a6ca1777a 100644..100755
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -1,136 +1,1386 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
- * TensorTensorContractionsycl.h
+ * TensorContractionSycl.h
*
* \brief:
- * TensorContractionsycl
+ * TensorContractionSycl.h, provides various tensor contraction kernels for the SYCL backend
*
-*****************************************************************/
+ *****************************************************************/
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
namespace Eigen {
-template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> > {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching the General Tensor-Vector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting elements to be processed by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+ // LocalThreadSizeC: determines the total number of threads per work-group for the contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ // LocalThreadSizeNC: determines the total number of threads per work-group for the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ // TileSizeDimNC: determines the tile size for the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+ // TileSizeDimC: determines the tile size for the contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+ // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+ // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+ // BC : determines if supporting bank conflict is required
+ static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines the tile size for dimension K. The packet size is assumed to be taken into account.
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+ // TileSizeDimK: determines the tile size for dimension K. The packet size is assumed to be taken into account.
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+ // WorkLoadPerThreadM: determines workload per thread for loading the M dimension. This can be varied based on the
+ // available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro).
+#ifndef EIGEN_SYCL_REG_M
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN: determines workload per thread for loading the N dimension. This can be varied based on the
+// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro).
+#ifndef EIGEN_SYCL_REG_N
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+ // LocalThreadSizeM: determines the total number of threads per work-group for the m dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ // LocalThreadSizeN: determines the total number of threads per work-group for the n dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ // TileSizeDimM: determines the tile size for the m dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+ // TileSizeDimN: determines the tile size for the n dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+ // LoadPerThreadLhs: determines workload per thread for loading the Lhs Tensor. This must be divisible by the packet size
+ static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+ ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+ // LoadPerThreadRhs: determines workload per thread for loading the Rhs Tensor. This must be divisible by the packet size
+ static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+ ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+ // BC : determines if supporting bank conflict is required
+ static EIGEN_CONSTEXPR bool BC = true;
+ // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+ // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
+ static EIGEN_CONSTEXPR bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+ false;
+#else
+ true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if each element of this tensor block should be loaded in packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed in a coalesced and
+ vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case
+ when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+ typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
+ const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+ const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+ const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+ return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if each element of this tensor block should be loaded in packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed in a coalesced and
+ vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case
+ when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
+ const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+ const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+ const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+ return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing the location of the data in the memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
+ write(PacketType &packet_data, DataScalar ptr) {
+ EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; i++) {
+ *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+ ptr += ld;
+ }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory when vectorization is enabled. This function
+ * is used to guarantee coalesced and vectorized stores whenever possible.
+ *
+ * \tparam data_source: an enum value representing the location of the data in the memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+ Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+ ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing the location of the data in the memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+ Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+ *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+ return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+ return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristics of a block of each Tensor processed
+ * by each work-group.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if each element of this tensor block should be loaded in packet mode
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of the tensor. If packet load is true, it will be
+ * PacketType; otherwise it will be the scalar type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout determines whether or not the Tensor data in memory can be accessed in a coalesced and
+ * vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case
+ * when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of the non-contracting dimension to access the next adjacent element within the
+ * Tensor Block for each work-group
+ *
+ * \param c_stride determines the stride of the contracting dimension to access the next adjacent element within the
+ * Tensor Block for each work-group
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+ static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+ typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+ static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+ typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
+ static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+ static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+ static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+ static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize: determines the number of k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This differs from the K dimension of the flattened Tensor when the tall/skinny algorithm is used.
+ *
+ * \param is_internal: determines if the thread within the work-group computes an internal block of the tensor or
+ * an edge block. When it is internal, there is no need to check the boundaries and all the if statements can be
+ * resolved by the compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+ const StorageIndex linearLocalThreadId;
+ const StorageIndex kGroupId;
+ const StorageIndex mGroupOffset;
+ const StorageIndex nGroupOffset;
+ const StorageIndex kGroupOffset;
+ const StorageIndex mLocalOffset;
+ const StorageIndex nLocalOffset;
+ const StorageIndex mGlobalOffset;
+ const StorageIndex nGlobalOffset;
+ StorageIndex kSize;
+ const bool is_internal;
+ // this is used to adjust the last block
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+ const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+ const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+ const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+ StorageIndex kSize_, const bool is_internal_)
+ : linearLocalThreadId(linearLocalThreadId_),
+ kGroupId(kGroupId_),
+ mGroupOffset(mGroupOffset_),
+ nGroupOffset(nGroupOffset_),
+ kGroupOffset(kGroupOffset_),
+ mLocalOffset(mLocalOffset_),
+ nLocalOffset(nLocalOffset_),
+ mGlobalOffset(mGlobalOffset_),
+ nGlobalOffset(nGlobalOffset_),
+ kSize(kSize_),
+ is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides the Tensor-Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties: determines if the input tensors are matrices. If they are, special memory
+ access is used to guarantee that the memory accesses are always coalesced.
+ *
+ * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written to the final output.
+ Otherwise, the result of the contraction will be written to a temporary buffer. This is the case when tall/skinny
+ contraction is used, in which case a final reduction step is required to compute the final output.
+
+ * \tparam contraction_tp: an enum value representing whether the local-memory or no-local-memory implementation of
+ the algorithm is to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+ typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+ typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+ typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+ PacketReturnType;
+ static EIGEN_CONSTEXPR int PacketSize =
+ Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+ static EIGEN_CONSTEXPR bool is_lhs_transposed =
+ !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+ static EIGEN_CONSTEXPR bool is_rhs_transposed =
+ !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+ typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+ PacketReturnType>
+ LHSBlockProperties;
+
+ typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+ PacketReturnType>
+ RHSBlockProperties;
+
+ static EIGEN_CONSTEXPR StorageIndex NStride =
+ contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+ typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+ typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+ typedef
+ typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
+ tile_ptr;
+ static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimM + Properties::BC
+ : Properties::WorkLoadPerThreadM;
+ static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimN + Properties::BC
+ : Properties::WorkLoadPerThreadN;
+ static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+ /**
+ * \brief MemHolder is a placeholder struct for creating the memory hierarchy in SYCL. Inside a SYCL kernel it is not
+ * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+ * the kernel as an accessor, the private memory can only be allocated statically. Since we are abstracting
+ * the TiledMemory for both local and private memory, the MemHolder struct is used as a helper to abstract out the
+ * different types of memory needed when the local/no_local memory computation is called.
+ *
+ * \tparam contraction_type: an enum value representing whether the local-memory or no-local-memory implementation
+ of the algorithm is to be used
+ * \tparam the private memory size
+ * \param ptr the tile memory pointer type
+ */
+ template <contraction_type, StorageIndex>
+ struct MemHolder {
+ tile_ptr ptr;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+ };
+ /**
+ * \brief specialization of memHolder class when no local memory kernel is used.
+ */
+ template <StorageIndex MemSize>
+ struct MemHolder<contraction_type::no_local, MemSize> {
+ OutScalar ptr[MemSize] = {OutScalar{0}};
+ };
+ /**
+ * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from
+ * global memory to local/private memory when local/no_local algorithm used.
+ *
+ * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+ * selected contraction_type.
+ *
+ * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+ * selected contraction_type.
+ *
+ * \param lhs_extract_index: determines the position of each thread in local memory for the lhs input. When private
+ * memory is used this is set to zero, as it is not applicable in the case of private memory.
+ *
+ * \param rhs_extract_index: determines the position of each thread in local memory for the rhs input. When private
+ * memory is used this is set to zero, as it is not applicable in the case of private memory.
+ *
+ * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the
+ * same as lhs_scratch_extract for private memory.
+ *
+ * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the
+ * same as rhs_scratch_extract for private memory.
+ */
+ struct TiledMemory {
+ MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+ MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+ tile_ptr lhs_scratch_ptr_compute;
+ tile_ptr rhs_scratch_ptr_compute;
+ const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+ const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+ template <contraction_type tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+ typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
+ : lhs_scratch_extract{},
+ rhs_scratch_extract{},
+ lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+ rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+ lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+ rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+ template <contraction_type tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
+ typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
+ : lhs_scratch_extract{block_start_ptr},
+ rhs_scratch_extract{lhs_scratch_extract.ptr +
+ ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+ lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+ rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+ lhs_extract_index(
+ local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+ rhs_extract_index(
+ local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+ };
+
+ Scratch scratch;
+ const LhsMapper lhs;
+ const RhsMapper rhs;
+ OutAccessor out_res;
+ const StorageIndex groupSizeM;
+ const StorageIndex groupSizeN;
+ const StorageIndex numTiles;
+ const TripleDim triple_dim;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+ const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex groupSizeM_,
+ const StorageIndex groupSizeN_,
+ const StorageIndex numTiles_,
+ const TripleDim triple_dim_)
+ : scratch(scratch_),
+ lhs(lhs_),
+ rhs(rhs_),
+ out_res(out_res_),
+ groupSizeM(groupSizeM_),
+ groupSizeN(groupSizeN_),
+ numTiles(numTiles_),
+ triple_dim(triple_dim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+ const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex groupSizeM_,
+ const StorageIndex numTiles_,
+ const TripleDim triple_dim_)
+ : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+ const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+ const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+ const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+ const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+ const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+ const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+ const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+ const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+ const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+ const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+ const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+ const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+ const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+ StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+ const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+ triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+ triple_dim.K - kGroupOffset >= kSizePerWG;
+ // this is used to adjust the last block
+ StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+ // This is used to find out the last K offset so that kGroupOffset - kSize can compute the contracting-dimension
+ // offset (cOffset) for loading the tile
+ kGroupOffset += kSize;
+
+ auto thread_properties =
+ ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+ mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+ auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+ (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+ : compute_panel<false>(itemID, thread_properties, out_ptr);
+ }
+ // The compute block computes the contraction operation for each thread's private block and stores the result in the
+ // privateRes memory of each computation. The compute block function is independent of the local and no-local concepts, as
+ // it only computes the block in each thread's private memory space.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+ PacketReturnType *privateRes) {
+ StorageIndex idx = 0;
+ EIGEN_CONSTEXPR StorageIndex lhs_stride =
+ contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+ auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+ StorageIndex lhs_index = 0;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+ PacketReturnType lhsPack{};
+ Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+ lhs_block_ptr + lhs_index);
+ privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+ lhs_index += lhs_stride;
+ idx++;
+ }
+ }
+ }
+ // The store function writes the computed contraction result held in the private memory of each thread to global
+ // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in the base
+ // class.
+ template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+ StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+ auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+ return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+ };
+ // When local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
+ // available, the K*N tile is transposed in local memory to N*K; therefore, each block operates on a blockId*
+ // WorkLoadPerThreadN slice of N.
+ EIGEN_CONSTEXPR StorageIndex GlobalNStride =
+ contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+ // output leading dimension
+ StorageIndex outputLD = 0;
+ // When local memory is used the PrivateNStride is always 1, because the coalesced access on N is loaded into local
+ // memory and extracting from local to global is the same as the non-transposed version. However, when local memory is
+ // not used and RHS is transposed, we packetize the load for RHS.
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+ StorageIndex globalRow = mGlobalOffset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+ PacketReturnType privetOut = privateRes[wLPTM];
+ if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+ // Store the final results in C. The C matrix always has M as the first StorageIndex and N as the second
+ // StorageIndex. Therefore it always has a coalesced layout.
+ write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+ StorageIndex mOffset = globalRow + mId;
+ if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+ out_ptr[mOffset + outputLD] =
+ Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+ }
+ }
+ }
+ globalRow += (PacketSize * Properties::LocalThreadSizeM);
+ }
+ outputLD += triple_dim.M;
+ privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+ }
+ out_ptr += (GlobalNStride * outputLD);
+
+ nGlobalOffset += (PrivateNStride * GlobalNStride);
+ }
+ }
+ // when no local memory is used the following extract_block will be enabled
+ template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+ contraction_type contract_tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
+ extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+ const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+ InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+ EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+ InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+ const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+ auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+ return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+ (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+ };
+ const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+ StorageIndex cIndex = cOffset;
+
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+ StorageIndex ncIndex = ncOffset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+ if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+ auto val =
+ read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+ data_source::private_mem>(val, private_ptr);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+ const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+ OutScalar val =
+ (ncInd < NC && cInd < triple_dim.K)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, ncInd, cInd, ld)
+ : OutScalar(0);
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+ data_source::private_mem>(
+ val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+ ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+ }
+ }
+
+ // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+ // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+ ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+ ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+ : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+ private_ptr += InputBlockProperties::nc_stride;
+ }
+ // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+ private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+ cIndex += InputBlockProperties::c_stride;
+ }
+ }
+ template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+ const StorageIndex &linearLocalThreadId) {
+ const StorageIndex localThreadNC =
+ (InputBlockProperties::is_coalesced_layout)
+ ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ const StorageIndex localThreadC =
+ (InputBlockProperties::is_coalesced_layout)
+ ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+ }
+
+ template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
+ sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+ db_offset = !db_offset;
+ }
+
+ template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
+ sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+
+ template <contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
+ sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
+ return;
+ }
+
+ template <bool need_sync, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
+ sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+ itemID
+#endif
+ ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+#else
+ return;
+#endif
+ }
+ template <bool need_sync, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
+ sync_thread(const cl::sycl::nd_item<1> &itemID) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ template <bool need_sync>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
+ const cl::sycl::nd_item<1> &) {
+ return;
+ }
+ template <bool is_internal_block>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+ ThreadProperties<StorageIndex> &thread_properties,
+ TiledMemory &tiled_input_block,
+ PacketReturnType *privateRes, bool &db_offset) {
+ // Tiling the Rhs block from global to local memory
+ extract_block<RHSBlockProperties, is_internal_block>(
+ rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+ tiled_input_block.rhs_extract_index,
+ contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+ thread_properties.kGroupOffset - thread_properties.kSize);
+
+ sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+ // Tiling the Lhs block from global to local memory
+ extract_block<LHSBlockProperties, is_internal_block>(
+ lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+ tiled_input_block.lhs_extract_index,
+ contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+ thread_properties.kGroupOffset - thread_properties.kSize);
+
+ // itemID.barrier(cl::sycl::access::fence_space::local_space);
+ sync_thread<contraction_tp == contraction_type::local>(itemID);
+ // switch to compute mode
+ StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+ StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+ // Loop over the values of a single tile
+ for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+ compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+ tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+ lhs_offset += LSDL;
+ rhs_offset += LSDR;
+ }
+ // computing the K index for the next tile
+ thread_properties.kSize -= Properties::TileSizeDimK;
+ sync_mem(itemID, db_offset);
+ }
+
+ // when local memory is available the following compute_panel will be enabled
+ template <bool is_internal_block, typename OutPtr>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+ ThreadProperties<StorageIndex> &thread_properties,
+ OutPtr out_ptr) {
+ auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+ // Allocate register space
+ PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+ PacketReturnType{0}};
+ bool db_offset = 0;
+
+ while (thread_properties.kSize >= Properties::TileSizeDimK) {
+ compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+ }
+ if (thread_properties.kSize > 0) {
+ compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+ }
+
+ // Storing the final results in the output
+ store<is_internal_block,
+ contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+ out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+ thread_properties.nGlobalOffset);
+ }
+ // When local memory is available the following extract_block will be enabled
+ template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+ contraction_type contract_tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
+ extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
+ const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+ InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+ EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+ InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+ EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+ static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+ (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+                   "LocalOffset must be divisible by stride");
+ const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+ StorageIndex localThreadNC = local_index.first;
+ StorageIndex localThreadC = local_index.second;
+ auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+ return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+ (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+ };
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+ const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+ const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+ const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+ if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+ auto val =
+ read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+ val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+ (InputBlockProperties::c_stride * localThreadC * LSD));
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+ const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+ OutScalar val =
+ (nCInd < NC && cInd < triple_dim.K)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, nCInd, cInd, ld)
+ : OutScalar(0);
+
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+ val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+ (InputBlockProperties::is_coalesced_layout ? i : 0) +
+ ((InputBlockProperties::c_stride * localThreadC +
+ (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+ LSD));
+ }
+ }
+ localThreadNC += (InputBlockProperties::is_coalesced_layout)
+ ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ localThreadC += (InputBlockProperties::is_coalesced_layout)
+ ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ }
+ }
+};
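
For orientation, the compute_panel/compute_tile_per_panel pair above walks the K dimension one TileSizeDimK slice at a time: extract a tile of each operand, accumulate its contribution into the private results, then advance (flipping a double-buffer offset when local memory is in use). Below is a minimal host-side C++ sketch of that loop under simplifying assumptions: no SYCL, no work-item decomposition, no double buffering. tiled_contraction_sketch, lhs_tile and rhs_tile are illustrative names, not part of Eigen.

#include <algorithm>
#include <cstddef>
#include <vector>

// A is row-major M-by-K, B is row-major K-by-N, C is column-major M-by-N (as the kernel writes it).
void tiled_contraction_sketch(const std::vector<float> &A, const std::vector<float> &B, std::vector<float> &C,
                              std::size_t M, std::size_t N, std::size_t K, std::size_t TileSizeDimK) {
  std::vector<float> lhs_tile(M * TileSizeDimK), rhs_tile(TileSizeDimK * N);
  std::size_t kRemaining = K, kOffset = 0;
  while (kRemaining > 0) {
    const std::size_t kSize = std::min(TileSizeDimK, kRemaining);
    // "extract_block": copy the current K-slice of each operand into the tile buffers.
    for (std::size_t m = 0; m < M; ++m)
      for (std::size_t k = 0; k < kSize; ++k) lhs_tile[m * TileSizeDimK + k] = A[m * K + kOffset + k];
    for (std::size_t k = 0; k < kSize; ++k)
      for (std::size_t n = 0; n < N; ++n) rhs_tile[k * N + n] = B[(kOffset + k) * N + n];
    // "compute_block_per_tile": accumulate this tile's contribution to every output element.
    for (std::size_t k = 0; k < kSize; ++k)
      for (std::size_t n = 0; n < N; ++n)
        for (std::size_t m = 0; m < M; ++m)
          C[n * M + m] += lhs_tile[m * TileSizeDimK + k] * rhs_tile[k * N + n];
    kOffset += kSize;
    kRemaining -= kSize;
  }
}
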
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides the Tensor-vector contraction operation, which is a
+ * special case of Tensor-Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the SYCL accessor type for the output (please see the SYCL-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written to the final output.
+ * Otherwise, the result of the contraction will be written to a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of the non-contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of the contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+ typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+ typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+ PacketReturnType;
+ static EIGEN_CONSTEXPR int PacketSize =
+ Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+ static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+ KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector we pass false and false to
+  // make sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
+ typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+ VecBlockProperties;
+
+ Scratch scratch;
+ const VectorMapper vec;
+ const TensorMapper mat;
+ OutAccessor out_res;
+ const StorageIndex nonContractGroupSize;
+ const StorageIndex nonContractDim;
+ const StorageIndex contractDim;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+ const TensorMapper mat_, OutAccessor out_res_,
+ const StorageIndex nonContractGroupSize_,
+ const StorageIndex nonContractDim_,
+ const StorageIndex contractDim_)
+ : scratch(scratch_),
+ vec(vec_),
+ mat(mat_),
+ out_res(out_res_),
+ nonContractGroupSize(nonContractGroupSize_),
+ nonContractDim(nonContractDim_),
+ contractDim(contractDim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto scratch_ptr = scratch.get_pointer();
+ const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+ StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+ : linearLocalThreadId % Properties::LocalThreadSizeNC;
+ StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+ : linearLocalThreadId / Properties::LocalThreadSizeNC;
+ const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+ const StorageIndex nonContractGroupId =
+ is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+ const StorageIndex contractGroupId =
+ is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+ auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+ const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+ const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+ auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+ const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+ const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+ auto local_output = scratch_ptr + OutScratchOffset;
+ const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+ contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+ is_internal
+ ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ scratch_ptr, contractGroupOffset,
+#endif
+ nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+ nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+ : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ scratch_ptr, contractGroupOffset,
+#endif
+ nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+ nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+ }
+ template <bool is_internal_block, typename OutPtr>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+ const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+ OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+ const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+ StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+ StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+ OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+ // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+ extract_block<VecBlockProperties, is_internal_block, KFactor,
+ Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+ vectorOffset, contractDim);
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+ StorageIndex privateOffsetC = 0;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+ StorageIndex privateOffsetNC = 0;
+ bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ auto vecScalar = *in_scratch_ptr;
+#else
+ auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+ ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+ is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+ : OutScalar(0);
+#endif
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ auto matScalar = (check_boundary<is_internal_block>(
+ contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+ ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+ : globalNonContractDimOffset + privateOffsetNC,
+ is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+ : globalContractDimOffset + privateOffsetC)
+ : OutScalar(0);
+
+ outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+ privateOffsetNC += Properties::LocalThreadSizeNC;
+ }
+ privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+ }
+
+ auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 elements in local memory should reduce to 16*1
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ *out_scratch_ptr = outScalar[j];
+
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ }
+ if (is_lhs_vec) {
+ nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+ contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+ outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+ }
+
+ out_scratch_ptr = local_output + outScratchIndex;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (contractId < offset) {
+          StorageIndex myNeighbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeighbourId];
+ }
+ }
+      // moving to the next 16 by 16 block
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ }
+
+ if (contractId == 0) {
+ out_scratch_ptr = local_output + nonContractId;
+ StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+ out_ptr += global_final_offset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+ auto res = *out_scratch_ptr;
+
+ *out_ptr = res;
+ out_ptr += Properties::LocalThreadSizeNC;
+ }
+      // moving to the next 16 by 16 block to get the next 16 reduced elements
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+ }
+ }
+ }
+
+ template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+ typename Local>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+ const StorageIndex &linearLocalThreadId,
+ const StorageIndex &cOffset, const StorageIndex &C) {
+ local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+ StorageIndex cIndex = cOffset;
+ for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+ if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+ auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+ cIndex, StorageIndex(1));
+ write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ OutScalar val =
+ (cIndex + i < C)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+ : OutScalar(0);
+ write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+ }
+ }
+ local_ptr += InputBlockProperties::c_stride * GroupSize;
+ cIndex += InputBlockProperties::c_stride * GroupSize;
+ }
+ }
+};
+#endif
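
The reduction stage of GeneralVectorTensor::compute_panel above writes each work-item's partial results into local memory laid out as LocalThreadSizeC rows of LocalThreadSizeNC entries, then folds the rows pairwise until row 0 holds one value per non-contracting output. A serial host-side sketch of that halving reduction, with the same power-of-two assumption on LocalThreadSizeC that the kernel makes (plain C++, so no barriers are needed):

#include <cstddef>
#include <vector>

// scratch holds LocalThreadSizeC * LocalThreadSizeNC partial sums, laid out as [contractId][nonContractId].
void tree_reduce_rows(std::vector<float> &scratch, std::size_t LocalThreadSizeNC, std::size_t LocalThreadSizeC) {
  for (std::size_t offset = LocalThreadSizeC >> 1; offset > 0; offset >>= 1)
    for (std::size_t contractId = 0; contractId < offset; ++contractId)
      for (std::size_t nonContractId = 0; nonContractId < LocalThreadSizeNC; ++nonContractId)
        scratch[contractId * LocalThreadSizeNC + nonContractId] +=
            scratch[(contractId + offset) * LocalThreadSizeNC + nonContractId];
  // Row 0 now holds the reduced value for each non-contracting index.
}
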
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar result of a Tensor-Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This kernel reduces two tensors to a scalar.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the SYCL accessor type for the output (please see the SYCL-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determines the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+ typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+ Scratch scratch;
+ const LhsMapper lhs;
+ const RhsMapper rhs;
+ OutAccessor out_res;
+ const StorageIndex rng;
+
+ EIGEN_DEVICE_FUNC
+ GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex rng_)
+ : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+ EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+ auto out_ptr = out_res.get_pointer();
+ auto scratch_ptr = scratch.get_pointer().get();
+
+ StorageIndex globalid = itemID.get_global_id(0);
+ StorageIndex localid = itemID.get_local_id(0);
+ OutScalar accumulator = OutScalar(0);
+ for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+ }
+ auto out_scratch_ptr = scratch_ptr + localid;
+ *out_scratch_ptr = accumulator;
+ for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+ }
+ }
+ if (localid == 0) {
+ out_ptr[itemID.get_group(0)] = accumulator;
+ }
+ }
+};
+#endif
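
GeneralScalarContraction combines a grid-stride accumulation with the same halving reduction in local memory, so each work-group produces one partial result (a second kernel finishes the reduction when there is more than one group). A serial host-side sketch of the per-group arithmetic, with local_range standing in for the work-group size and assumed, as in the kernel, to be a power of two:

#include <cstddef>
#include <vector>

float scalar_contraction_sketch(const std::vector<float> &lhs, const std::vector<float> &rhs,
                                std::size_t local_range) {
  const std::size_t rng = lhs.size();  // assumes rhs.size() == lhs.size()
  std::vector<float> partial(local_range, 0.0f);
  // Grid-stride accumulation: "work-item" i handles elements i, i + local_range, ...
  for (std::size_t i = 0; i < local_range; ++i)
    for (std::size_t k = i; k < rng; k += local_range) partial[i] += lhs[k] * rhs[k];
  // Halving tree reduction, mirroring the barrier/offset loop in the kernel.
  for (std::size_t offset = local_range >> 1; offset > 0; offset >>= 1)
    for (std::size_t i = 0; i < offset; ++i) partial[i] += partial[i + offset];
  return partial[0];
}
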
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+ Eigen::SyclDevice>
+ : public TensorContractionEvaluatorBase<TensorEvaluator<
+ const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
"SYCL tensor contraction does not support output kernels.");
- typedef const Eigen::SyclDevice Device;
+ typedef Eigen::SyclDevice Device;
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
+ typedef typename XprType::Index StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
+ typedef typename Base::Storage Storage;
+ typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+ struct TripleDim {
+ const StorageIndex M;
+ const StorageIndex N;
+ const StorageIndex K;
+ TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+ };
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
};
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+ static EIGEN_CONSTEXPR int LDims = Base::LDims;
+ static EIGEN_CONSTEXPR int RDims = Base::RDims;
+ static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
+ typedef array<StorageIndex, LDims> left_dim_mapper_t;
+ typedef array<StorageIndex, RDims> right_dim_mapper_t;
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
+ typedef array<StorageIndex, ContractDims> contract_t;
+ typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+ typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
static const int NumDims = LDims + RDims - 2 * ContractDims;
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+ typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+ typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
+ typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
typedef typename LeftEvaluator::Dimensions LeftDimensions;
typedef typename RightEvaluator::Dimensions RightDimensions;
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+ struct input_mapper_propertis {
+ static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+ static EIGEN_CONSTEXPR bool is_rhs_matrix =
+ (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
// We need to redefine this method to make nvcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
+ if (!data) {
+ this->m_result = this->m_device.get(
+ static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+ data = this->m_result;
}
+ evalToSycl(data);
+ return (this->m_result != NULL);
}
- const Eigen::SyclDevice& device() const {return this->m_device;}
- void evalTo(Scalar* buffer) const {
- // Here is the result
+ const Eigen::SyclDevice &device() const { return this->m_device; }
+ void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
if (this->m_lhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
+ } else {
evalTyped<true, true, false, Unaligned>(buffer);
}
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
+ } else {
+ if (this->m_rhs_inner_dim_reordered) {
evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
+ } else {
evalTyped<true, false, false, Unaligned>(buffer);
}
}
- }
- else {
+ } else {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
+ } else {
evalTyped<false, true, false, Unaligned>(buffer);
}
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
+ } else {
+ if (this->m_rhs_inner_dim_reordered) {
evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
+ } else {
evalTyped<false, false, false, Unaligned>(buffer);
}
}
@@ -138,267 +1388,263 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
- // rows in left side
- const Index m = this->m_i_size;
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
+ void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+ const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+ PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
+ LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+ right_nocontract_t, contract_t,
+ PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
+ RhsMapper;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+ if (triple_dim.M == 1 && triple_dim.N == 1) {
+ launchSC(buffer, lhs, rhs, triple_dim.K);
+ } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+ if (triple_dim.M != 1 && triple_dim.N == 1) {
+ LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+ } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+ LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+    } else // This is equivalent to if (m != 1 && n != 1)
+#endif
+ {
+ typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+ inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+ bool skinny = false;
+ auto platform_name = this->device().getPlatformName();
+ // This is based on empirical calculation for AMD r9-nano and Fiji
+ if (platform_name.find("AMD") == 0) {
+ skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+ ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+ (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+ } else {
+ skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+ ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+ ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+ }
+ if (skinny)
+ adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+ else
+#endif // EIGEN_SYCL_DISABLE_SKINNY
+ adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+ }
}
- // required by sycl to construct the expr on the device. Returns original left_impl
- const TensorEvaluator<LeftArgType, Device>& left_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
+
+ template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+ void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ if (device().has_local_memory()) {
+ typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+ launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+ buffer, lhs, rhs, triple_dim);
+ }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+ if (!(device().has_local_memory())) {
+ typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+ launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+ buffer, lhs, rhs, triple_dim);
+ }
+#endif
}
- // required by sycl to construct the expr on the device. Returns original right_impl
- const TensorEvaluator<RightArgType, Device>& right_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
+
+ template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+ typename Properties, typename LhsMapper, typename RhsMapper>
+ void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ const TripleDim &triple_dim) const {
+ const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+ const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+ const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+ const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+ const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+ StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+ StorageIndex groupSizeK =
+ skinny
+ ? std::max(std::min(totalTilesK,
+ (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+ (groupSizeM * groupSizeN)),
+ StorageIndex(1))
+ : StorageIndex(1);
+
+ const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+ const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+ const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+ const StorageIndex globalRange = totalGroupSize * localRange;
+
+ const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+ ? ((Properties::DoubleBuffer + 1) *
+ (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+ ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+ (Properties::TileSizeDimN + Properties::BC))
+ : StorageIndex(1);
+
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (groupSizeK == 1) {
+ typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+ LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+ PacketAccess, input_mapper_properties, true, ct>
+ ContractKernelName;
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
+ } else {
+ typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+ LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+ PacketAccess, input_mapper_properties, false, ct>
+ ContractKernelName;
+ CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+ device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+ triple_dim);
+
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ auto op = Op();
+ typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+ EvaluatorPointerType, Op>
+ ReductionKernel;
+
+ device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+ Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+ cl::sycl::range<1>(localRange)),
+ StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
+
+ device().deallocate_temp(temp_pointer);
+ }
}
-};
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
-typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
-typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
- LHSFunctorExpr lhs_functors;
- RHSFunctorExpr rhs_functors;
- LhsLocalAcc localLhs;
- RhsLocalAcc localRhs;
- OutAccessor out_res;
- size_t out_offset;
- Index roundUpK, M, N, K;
- ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
- LeftNocontractT m_i_strides, m_left_nocontract_strides;
- RightNocontractT m_j_strides, m_right_nocontract_strides;
- LHSTupleType left_tuple_of_accessors;
- RHSTupleType right_tuple_of_accessors;
- Device dev;
-
-
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
- Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
- ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
- out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
- m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
- m_right_contracting_strides(m_right_contracting_strides_),
- m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
- m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
-
- void operator()(cl::sycl::nd_item<2> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
- auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
- auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
- typedef decltype(lhs_dev_expr.expr) LeftArgType;
- typedef decltype(rhs_dev_expr.expr) RightArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, LeftNocontractT,
- ContractT, 1,
- lhs_inner_dim_contiguous,
- false, Unaligned, MakeGlobalPointer> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, RightNocontractT,
- ContractT, 1,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
- // initialize data mappers must happen inside the kernel for device eval
- LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
- auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
- // Matmul Kernel
- // Thread identifiers
- const Index mLocalThreadId = itemID.get_local(0); // Local ID row
- const Index nLocalThreadId = itemID.get_local(1); // Local ID col
- const Index mGroupId = itemID.get_group(0); // Work-group ID row
- const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
- const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
- // Allocate register space
- LhsScalar privateLhs;
- RhsScalar privateRhs[WorkLoadPerThreadN];
- OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
- // Initialise the privateResumulation registers
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] = static_cast<OutScalar>(0);
- }
- }
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+ template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+ void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+ StorageIndex NC, StorageIndex C) const {
+ const StorageIndex nonContractDim = NC;
+ EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
+ EIGEN_CONSTEXPR StorageIndex CFactor = 1;
+ EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+ typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+ Properties;
+ const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+ const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+ const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+ const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+ const StorageIndex globalRange =
+ (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+ const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+ const StorageIndex scratchSize =
+ (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (cNumGroups > 1) {
+ typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+ TensorMapper, StorageIndex, Properties, CFactor, false,
+ is_lhs_vec, false>
+ ContractKernelName;
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
- // Tile Lhs
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // Load the value (wide vector load)
- Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
- localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
- }
- // Tile Rhs
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
- localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
- }
- // Loop over all tiles
- const Index numTiles = roundUpK/TileSizeDimK;
- Index firstHalf=0;
- do {
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // Load the next tile of Lhs and Rhs into local memory
- Index nextHalf = firstHalf + 1;
- if (nextHalf < numTiles) {
- // Tile A
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // global K id
- Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
- // Store the loaded value into local memory
- localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
- }
- // Tile B
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
- // Store the loaded vector into local memory
- localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
- }
- }
- // Loop over the values of a single tile
- for (Index k=0; k<TileSizeDimK; k++) {
- // Cache the values of localRhs in registers
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
- privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
- }
- // Perform the computation
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
- privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
- }
- }
- }
- // Next tile
- firstHalf++;
- } while (firstHalf<numTiles);
-
- // Store the final results in C
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
- if (globalRow< M){
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
- if(globalCol<N)
- out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
- }
- }
- }
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+ EvaluatorPointerType, Op>
+ ReductionKernel;
+ device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+ cl::sycl::range<1>(localRange)),
+ StorageIndex(1), Op(), nonContractDim, cNumGroups);
+
+ device().deallocate_temp(temp_pointer);
+ } else {
+ typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+ TensorMapper, StorageIndex, Properties, CFactor, false,
+ is_lhs_vec, true>
+ ContractKernelName;
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
}
+ }
+#endif
-};
-template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const Index TileSizeDimM = 32ul; // Tile size for dimension M
-static const Index TileSizeDimN = 32ul; // Tile size for dimension N
-static const Index TileSizeDimK = 16ul; // Tile size for dimension K
-static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
-static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
-static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
-
-// RoundUp function to make sure that the global threadId is divisable by local threadId
-static Index RoundUp(Index x, Index y) {
- return ((((x) + (y) - 1) / (y))*(y));
-}
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+ template <typename LhsMapper, typename RhsMapper>
+ EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ StorageIndex K) const {
+ EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+ EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
-template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
- static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
- ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
- LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
-
- typedef typename Self::XprType HostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
- typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
- // extract lhs functor list
- LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
- // extract rhs functor list
- RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
-
- Index roundUpK = RoundUp(K, TileSizeDimK);
- Index roundUpM = RoundUp(M, TileSizeDimM);
- Index roundUpN = RoundUp(N, TileSizeDimN);
- ptrdiff_t out_offset = self.device().get_offset(buffer);
- self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
- // create lhs tuple of accessors
- LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
- // create rhs tuple of accessors
- RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
-
- // Local memory for elements of Lhs
- typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
- LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
- // Local memory for elements of Rhs
- typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
- RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
- //OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
- // sycl parallel for
- cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
- cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
- RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
- });
- self.device().asynchronousExec();
+    // Here we force the code to be at most a 2-step reduction: our empirical research shows that if each thread
+    // reduces at least 512 elements individually, we get better performance.
+ const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+ const StorageIndex global_range = num_work_group * local_range;
+
+ typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+ CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+ ContractKernelName;
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ if (num_work_group > 1) {
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+ thread_range, local_range, K);
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+ EvaluatorPointerType, StorageIndex, local_range>
+ GenericRKernel;
+ device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+ device().deallocate_temp(temp_pointer);
+ } else {
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+ local_range, K);
+ }
}
-};
+#endif
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ this->m_leftImpl.cleanup();
+ this->m_rightImpl.cleanup();
+
+ if (this->m_result) {
+ this->m_device.deallocate_temp(this->m_result);
+ this->m_result = NULL;
+ }
+ }
+  // The placeholder accessors must be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ this->m_leftImpl.bind(cgh);
+ this->m_rightImpl.bind(cgh);
+ this->m_result.bind(cgh);
+ }
+};
+} // namespace Eigen
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
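
To summarise the dispatch performed by evalTyped in this file: the scalar-contraction kernel runs when both output dimensions are 1, a tensor-vector kernel when exactly one of them is 1, and the general tiled kernel otherwise, optionally flagged as skinny by the platform-dependent heuristics shown above (each path only when it is not disabled at compile time). A host-side sketch of that decision, where choose_path and ContractionPath are illustrative names and the thresholds are taken from the code:

#include <algorithm>
#include <cstdint>

enum class ContractionPath { Scalar, VectorTensor, TensorVector, General, GeneralSkinny };

ContractionPath choose_path(std::int64_t M, std::int64_t N, std::int64_t K, bool amd_platform) {
  if (M == 1 && N == 1) return ContractionPath::Scalar;
  if (N == 1) return ContractionPath::TensorVector;  // the rhs collapses to a vector
  if (M == 1) return ContractionPath::VectorTensor;  // the lhs collapses to a vector
  bool skinny;
  if (amd_platform) {
    skinny = (M < K || N < K) &&
             ((M < 1024 && N < 1024) || (std::uint64_t(M) * std::uint64_t(N) < std::uint64_t(K)));
  } else {
    skinny = (std::max(K, N) / std::min(K, N) > 100) || (std::max(K, M) / std::min(K, M) > 100) ||
             (std::max(N, M) / std::min(N, M) > 100);
  }
  return skinny ? ContractionPath::GeneralSkinny : ContractionPath::General;
}
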
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 5c94165d1..0218727d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -18,207 +18,252 @@
namespace Eigen {
/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel1D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize, range_x, range_y;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV1D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+ const size_t kernelSize;
+ const cl::sycl::range<2> input_range;
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+ const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernelSize(kernelSize_),
+ input_range(input_range_) {}
+
+ template <typename BooleanDim2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+ return (boolean_check[0] && boolean_check[1]);
+ }
void operator()(cl::sycl::nd_item<2> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-
- const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory
- const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
- const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ // the number of input rows required for each plane in shared memory
+ const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+ const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+ const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+ const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
/// fill the shared memory
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + plane_kernel_offset ;
- const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
- if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
+ for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + plane_kernel_offset;
+ const size_t tensor_index =
+ plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+ local_acc[local_index] =
+ (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+ ? device_evaluator.coeff(tensor_index)
+ : CoeffReturnType(0);
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
- // calculate the convolution
- const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
+ // calculate the convolution; first_output_start is the output start along x
+ const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+ if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
- const size_t index = plane_kernel_offset+ itemID.get_local(0);
+ const size_t index = plane_kernel_offset + itemID.get_local_id(0);
for (size_t k = 0; k < kernelSize; ++k) {
result += (local_acc[k + index] * kernel_ptr[k]);
}
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+ buffer_ptr[tensor_index] = result;
}
}
};
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel2D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV2D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+ const cl::sycl::range<2> kernel_size;
+ const cl::sycl::range<3> input_range;
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+ const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernel_size(kernel_size_),
+ input_range(input_range_) {}
+ template <typename BooleanDim3>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+ }
void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
- const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
- const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
- const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
-
- /// fill the shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ // the number of input rows required for each plane in shared memory
+ const auto num_input = cl::sycl::range<2>{
+ (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+ const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+ const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+ const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+ itemID.get_group(1) * itemID.get_local_range()[1]};
+
+ // fill the local memory
+ bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+ for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+ const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+ bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+ for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
const size_t local_index = i + local_input_offset;
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
+ const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+ i + input_offset[0], j + input_offset[1]);
+ local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+ in_range_dim1 && in_range_dim2)
+ ? device_evaluator.coeff(tensor_index)
+ : CoeffReturnType(0);
+ }
}
- }
itemID.barrier(cl::sycl::access::fence_space::local_space);
- // calculate the convolution
- const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+ // output offset start for each work-group
+ const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+ itemID.get_group(1) * itemID.get_local_range()[1]};
+
+ if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t j = 0; j < kernelSize_y; j++) {
- size_t kernel_offset =kernelSize_x * j;
- const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
- for (size_t i = 0; i < kernelSize_x; i++) {
- result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
+
+ for (size_t j = 0; j < kernel_size[1]; j++) {
+ size_t kernel_offset = kernel_size[0] * j;
+ const size_t index =
+ (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+ for (size_t i = 0; i < kernel_size[0]; i++) {
+ result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
}
}
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
- buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+ itemID.get_local_id(1) + output_offset[1]);
+
+ buffer_ptr[tensor_index] = result;
}
}
};
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV3D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+ const cl::sycl::range<3> kernel_size;
+ const cl::sycl::range<3> input_range;
+ const size_t numP;
+
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+ const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+ const size_t numP_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernel_size(kernel_size_),
+ input_range(input_range_),
+ numP(numP_) {}
+ template <typename BooleanDim3>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+ }
+ void operator()(cl::sycl::nd_item<3> itemID) {
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+ const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel3D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
- const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
- kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+ const auto output_offset =
+ cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
- const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
- const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
- const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
- for(size_t p=0; p<numP; p++){
+ for (size_t p = 0; p < numP; p++) {
/// fill the shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
- for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
+ const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+ for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+ size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+ bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+ for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+ bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+ size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
+ for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+ bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+ const size_t local_index = local_index_dim1 + i;
+ const size_t tensor_index =
+ plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+ i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+ local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
}
}
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
// calculate the convolution
- const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
- const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
- const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+ if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t k = 0; k < kernelSize_z; k++) {
- for (size_t j = 0; j < kernelSize_y; j++) {
- for (size_t i = 0; i < kernelSize_x; i++) {
- const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
- const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
+ for (size_t k = 0; k < kernel_size[2]; k++) {
+ for (size_t j = 0; j < kernel_size[1]; j++) {
+ for (size_t i = 0; i < kernel_size[0]; i++) {
+ const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+ const size_t local_index =
+ ((i + itemID.get_local_id(0)) +
+ num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
result += (local_acc[local_index] * kernel_ptr[kernel_index]);
}
}
}
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
- buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+ buffer_ptr[tensor_index] = result;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -226,25 +271,32 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
}
};
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
-{
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
+ static const int NumDims =
+ internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
+ typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
typedef const Eigen::SyclDevice Device;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
enum {
- IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
+ IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+ TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
- Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
+ Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
@@ -253,13 +305,22 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+ : m_inputImpl(op.inputExpression(), device),
+ m_kernelArg(op.kernelExpression()),
+ m_kernelImpl(op.kernelExpression(), device),
+ m_indices(op.indices()),
+ m_buf(NULL),
+ m_kernel(NULL),
+ m_local_kernel(false),
+ m_device(device) {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+ static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+ m_kernelImpl.dimensions();
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
@@ -271,21 +332,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+ EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
executeEval(data);
return false;
} else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ m_buf = (EvaluatorPointerType)m_device.get(
+ (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
executeEval(m_buf);
return true;
}
@@ -294,194 +351,194 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_buf) {
- m_device.deallocate(m_buf);
+ m_device.deallocate_temp(m_buf);
m_buf = NULL;
}
if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
+ m_device.deallocate_temp(m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buf; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
+ typename KernelStorage::Type in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+ EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
- typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+ typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
typedef typename InputEvaluator::Dimensions InputDims;
+ switch (NumKernelDims) {
+ case 1: {
+ const size_t numX = dimensions()[m_indices[0]];
+ const size_t numP = dimensions().TotalSize() / numX;
+ const auto input_dim = std::array<size_t, 2>{numX, numP};
+ auto global_range = cl::sycl::range<2>{};
+ auto local_range = cl::sycl::range<2>{};
+ const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+ const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ const array<Index, 1> indices{{m_indices[0]}};
+ const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+ ConvKernel;
+
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+ break;
+ }
- typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
- // extract input functor list
- InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
- ptrdiff_t out_offset = m_device.get_offset(data);
-
-
- m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
-
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
- /// work-around for gcc 4.8 auto bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
- // create input tuple of accessors
- InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
- KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
-
- switch (NumKernelDims) {
- case 1: {
- const size_t numX = dimensions()[m_indices[0]];
- const size_t numP = dimensions().TotalSize() / numX;
- const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
- m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
- const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
- gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
- auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 1> indices{{m_indices[0]}};
- const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
- EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 2: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numP = dimensions().TotalSize() / (numX*numY);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
- const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
- gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
- const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
- break;
- }
+ case 2: {
+ auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+ auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+ const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+ const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+ const size_t numP = dimensions().TotalSize() / (numX * numY);
+ auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+ auto global_range = cl::sycl::range<3>{};
+ auto local_range = cl::sycl::range<3>{};
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+ const size_t local_memory_size =
+ (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+ const array<Index, 2> kernel_dims{
+ {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+ ConvKernel;
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+ break;
+ }
- case 3: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numZ = dimensions()[m_indices[idxZ]];
- const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
- const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
- const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
- const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
- gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
- break;
- }
+ case 3: {
+ auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+ auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+ const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+ const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+ const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+ auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+ const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+ const array<Index, 3> indices{
+ {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+ const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+ m_kernelImpl.dimensions()[kernel_index[1]],
+ m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+ auto global_range = cl::sycl::range<3>{};
+ auto local_range = cl::sycl::range<3>{};
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+ auto local_memory_range = (local_range + kernel_size - 1);
+ const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+ ConvKernel;
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+ break;
+ }
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+ THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
- });
- m_device.asynchronousExec();
+ }
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_buf != NULL);
eigen_assert(index < m_dimensions.TotalSize());
return m_buf[index];
}
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
+ template <int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+ eigen_assert(m_buf != NULL);
eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
// model.
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
// We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost =
NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+ }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_kernelImpl.bind(cgh);
+ m_inputImpl.bind(cgh);
+ m_buf.bind(cgh);
+ m_kernel.bind(cgh);
}
private:
// No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
- TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
+ TensorEvaluator &operator=(const TensorEvaluator &);
+ TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
KernelArgType m_kernelArg;
- TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
+ TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
Indices m_indices;
Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
+ EvaluatorPointerType m_buf;
+ typename KernelStorage::Type m_kernel;
bool m_local_kernel;
- const Eigen::SyclDevice& m_device;
-};
+ const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
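A small sketch of the shared-memory budget the convolution branches assert against sharedMemPerBlock() (illustrative only, not part of the patch; the function name and the 16x16 / 3x3 sample sizes are assumptions):

  #include <cstddef>

  // Each CONV2D work-group stages (local_x + kernel_x - 1) * (local_y + kernel_y - 1)
  // input samples per plane in local memory, and keeps local_p planes resident at once.
  static std::size_t conv2d_local_memory_elems(std::size_t local_x, std::size_t local_y,
                                               std::size_t local_p,
                                               std::size_t kernel_x, std::size_t kernel_y) {
    return (local_x + kernel_x - 1) * (local_y + kernel_y - 1) * local_p;
  }

For a 16x16x1 work-group and a 3x3 filter this is 18 * 18 * 1 = 324 elements, i.e. 1296 bytes of local memory for float data.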
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6f8b6f193..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,7 +16,6 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>
-
namespace Eigen {
namespace TensorSycl {
@@ -70,9 +69,9 @@ struct SyclDeviceInfo {
} // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
-// TensorFlow via the Eigen SYCL Backend.
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
-> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@@ -421,6 +420,91 @@ class QueueInterface {
return pMapper.get_offset(ptr);
}
+ template <typename OutScalar, typename sycl_kernel, typename Lhs,
+ typename Rhs, typename OutPtr, typename Range, typename Index,
+ typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+ const Rhs &rhs, OutPtr outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // bind the placeholder accessors to the command group handler
+ lhs.bind(cgh);
+ rhs.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename OutPtr, typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+ OutPtr &outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // bind the placeholder accessors to the command group handler
+ inptr.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // bind the placeholder accessors to the command group handler
+ inptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+
EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
m_queue.wait_and_throw();
@@ -429,6 +513,7 @@ class QueueInterface {
#endif
}
+
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
@@ -457,11 +542,10 @@ class QueueInterface {
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
- Index &tileSize0,
- Index &tileSize1, Index &rng0,
- Index &rng1, Index &GRange0,
- Index &GRange1) const {
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ std::array<Index, 2> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
@@ -469,26 +553,28 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =
+ local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- rng1 = dim1;
- if (rng1 == 0) rng1 = static_cast<Index>(1);
- GRange1 = rng1;
- if (tileSize1 > GRange1)
- tileSize1 = GRange1;
- else if (GRange1 > tileSize1) {
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
- tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
- rng0 = dim0;
- if (rng0 == 0) rng0 = static_cast<Index>(1);
- GRange0 = rng0;
- if (tileSize0 > GRange0)
- tileSize0 = GRange0;
- else if (GRange0 > tileSize0) {
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
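A compact sketch of the tiling performed by the reworked 2D parallel_for_setup (illustrative only, not part of the patch; split_2d is a hypothetical name and the logic assumes a power-of-two maximum work-group size):

  #include <algorithm>
  #include <array>
  #include <cmath>
  #include <cstddef>

  static void split_2d(std::size_t max_wg, std::array<std::size_t, 2> dims,
                       std::array<std::size_t, 2>& global_range,
                       std::array<std::size_t, 2>& local_range) {
    const std::size_t pow_of_2 = static_cast<std::size_t>(std::log2(max_wg));
    local_range[1] = std::size_t(1) << (pow_of_2 / 2);  // e.g. max_wg = 256 -> 16
    local_range[0] = max_wg / local_range[1];           // remaining threads  -> 16
    for (int d = 0; d < 2; ++d) {
      const std::size_t range = dims[d] ? dims[d] : 1;  // guard against a zero dimension
      local_range[d] = std::min(local_range[d], range);
      // Round the global range up to a multiple of the local range so the nd_range is
      // well formed; the kernels themselves mask out-of-range work-items.
      global_range[d] = ((range + local_range[d] - 1) / local_range[d]) * local_range[d];
    }
  }

The member function above does the same per dimension, writing the results directly into cl::sycl::range objects instead of the scalar out-parameters of the old signature.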
@@ -496,9 +582,9 @@ class QueueInterface {
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
- Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
- Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
- Index &GRange1, Index &GRange2) const {
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ std::array<Index, 3> input_range = input_dim;
Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size =
@@ -506,45 +592,48 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =
+ local_range[2] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
- rng2 = dim2;
- if (rng2 == 0) rng1 = static_cast<Index>(1);
- GRange2 = rng2;
- if (tileSize2 > GRange2)
- tileSize2 = GRange2;
- else if (GRange2 > tileSize2) {
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+ input_range[2] = input_dim[2];
+ if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
+ global_range[2] = input_range[2];
+ if (local_range[2] > global_range[2])
+ local_range[2] = global_range[2];
+ else if (global_range[2] > local_range[2]) {
+ Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+ if (xMode != 0)
+ global_range[2] += static_cast<Index>(local_range[2] - xMode);
}
pow_of_2 = static_cast<Index>(
- std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
- tileSize1 =
+ std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+ local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- rng1 = dim1;
- if (rng1 == 0) rng1 = static_cast<Index>(1);
- GRange1 = rng1;
- if (tileSize1 > GRange1)
- tileSize1 = GRange1;
- else if (GRange1 > tileSize1) {
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
}
- tileSize0 =
- static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
- rng0 = dim0;
- if (rng0 == 0) rng0 = static_cast<Index>(1);
- GRange0 = rng0;
- if (tileSize0 > GRange0)
- tileSize0 = GRange0;
- else if (GRange0 > tileSize0) {
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+ local_range[0] = static_cast<Index>(max_workgroup_Size /
+ (local_range[1] * local_range[2]));
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
}
}
EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
return true;
@@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
- Index &tileSize0,
- Index &tileSize1, Index &rng0,
- Index &rng1, Index &GRange0,
- Index &GRange1) const {
- queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
- rng1, GRange0, GRange1);
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(
- Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
- Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
- Index &GRange1, Index &GRange2) const {
- queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
- tileSize2, rng0, rng1, rng2, GRange0,
- GRange1, GRange2);
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
}
/// allocate device memory
@@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
return queue_stream()->getDeviceVendor();
}
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+ queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+ queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+ queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
};
} // end namespace Eigen
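
The binary/unary/nullary launcher members added above are thin variadic forwarders: they pass the argument pack through to the queue object and let the two explicit template parameters select the output scalar and the kernel functor. A minimal sketch of the pattern, with hypothetical Frontend/Backend types standing in for SyclDevice and its queue interface:

struct Backend {
  template <typename OutScalar, typename KernelType, typename... T>
  void unary_kernel_launcher(T... /*args*/) const {
    // a real implementation would build the KernelType functor and submit it here
  }
};

struct Frontend {
  const Backend* impl;
  template <typename OutScalar, typename KernelType, typename... T>
  void unary_kernel_launcher(T... var) const {
    // `template` disambiguates the dependent member template, as in the patch
    impl->template unary_kernel_launcher<OutScalar, KernelType>(var...);
  }
};
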
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 9926046b9..b83174ab7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
-template <bool Vectorizable, typename Evaluator>
-struct ExecExprFunctorKernel_impl {
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
typedef typename Evaluator::Index Index;
- const Index range;
- const Index vectorizable_threads;
Evaluator evaluator;
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
- const Index range_, const Index vectorizable_threads_,
- Evaluator evaluator_)
- : range(range_), vectorizable_threads(vectorizable_threads_),
- evaluator(evaluator_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
- operator()(cl::sycl::nd_item<1> itemID) {
+ const Index range;
+ template <typename Scratch>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+ const Scratch, Evaluator evaluator_, const Index range_)
+ : evaluator(evaluator_), range(range_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+ cl::sycl::nd_item<1> itemID) {
+ compute(itemID);
+ }
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
- EIGEN_UNROLL_LOOP
+
for (Index i = gId; i < range; i += total_threads) {
evaluator.evalScalar(i);
}
}
-};
-
-template <typename Evaluator>
-struct ExecExprFunctorKernel_impl<true, Evaluator> {
- typedef typename Evaluator::Index Index;
- const Index range;
- const Index vectorizable_threads;
- Evaluator evaluator;
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
- const Index range_, const Index vectorizable_threads_,
- Evaluator evaluator_)
- : range(range_), vectorizable_threads(vectorizable_threads_),
- evaluator(evaluator_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
- operator()(cl::sycl::nd_item<1> itemID) {
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
+ const Index vectorizedRange =
+ (range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
- if (gId < vectorizable_threads) {
- const Index PacketSize = Eigen::internal::unpacket_traits<
- typename Evaluator::PacketReturnType>::size;
- evaluator.evalPacket(gId * PacketSize);
- gId += (vectorizable_threads * PacketSize);
- EIGEN_UNROLL_LOOP
- for (Index i = gId; i < range; i += vectorizable_threads) {
- evaluator.evalScalar(i);
- }
+ const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ const Index start = Evaluator::PacketSize * gId;
+ for (Index i = start; i < vectorizedRange; i += step) {
+ evaluator.evalPacket(i);
+ }
+ gId += vectorizedRange;
+ for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+ evaluator.evalScalar(i);
}
}
};
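
compute() above follows the usual packet pattern: a grid-stride loop over the packet-aligned prefix of the range, followed by a scalar grid-stride loop over the remainder. A host-side sketch of the same index arithmetic, assuming an evaluator type that exposes PacketSize, evalPacket and evalScalar as the one in the kernel does:

// Sketch: split [0, range) between `num_threads` logical threads the way the
// SYCL kernel does: whole packets first, scalar tail second.
template <typename Evaluator>
void eval_range(Evaluator& ev, long range, long thread_id, long num_threads) {
  const long PacketSize = Evaluator::PacketSize;
  const long vectorized_range = (range / PacketSize) * PacketSize;
  const long step = PacketSize * num_threads;              // grid stride, in coefficients
  for (long i = PacketSize * thread_id; i < vectorized_range; i += step)
    ev.evalPacket(i);                                      // whole packets
  for (long i = vectorized_range + thread_id; i < range; i += num_threads)
    ev.evalScalar(i);                                      // leftover coefficients
}
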
-template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator>
-struct ExecExprFunctorKernel
- : ExecExprFunctorKernel_impl<
- ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
- Evaluator> {
- ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
- const Evaluator &evaluator)
- : ExecExprFunctorKernel_impl<
- ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
- Evaluator>(range_, vectorizable_threads_, evaluator) {}
-};
-
-template <typename Expr, typename Evaluator>
-struct ExecExprFunctorKernel<Expr, false, Evaluator>
- : ExecExprFunctorKernel_impl<false, Evaluator> {
- ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
- const Evaluator &evaluator)
- : ExecExprFunctorKernel_impl<false, Evaluator>(
- range_, vectorizable_threads_, evaluator) {}
-};
-
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
- public:
+ public:
typedef typename Expression::Index Index;
- static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const Eigen::SyclDevice& dev) {
+ typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+ Evaluator evaluator(expr, dev);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
Index range, GRange, tileSize;
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
total_size = (total_size == 0) ? 1 : total_size;
- const int PacketSize = Eigen::PacketType<
- typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType,
- Eigen::SyclDevice>::size;
- Index vectorizable_threads =
- static_cast<Index>(total_size / PacketSize);
+ const int PacketSize =
+ Eigen::PacketType<typename Evaluator::CoeffReturnType,
+ Eigen::SyclDevice>::size;
+ Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
range = total_size;
- auto f = [&](cl::sycl::handler &cgh) {
- evaluator.bind(cgh);
- typedef ExecExprFunctorKernel<Expression, true,
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
- conditional_vectorized_kernel;
-
- typedef ExecExprFunctorKernel<Expression, false,
- Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
- non_vectorized_kernel;
-// This is to make sure that an expression with a size less than vectorized size
-// will not call the vectorized kernel.
-// The reason for having this kernel is that the vectorisable parameter is a
-// compile-time parameter,
-// however, the size of a tensor is a run-time parameter
- (vectorizable_threads)
- ? cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- dev.program().template get_kernel<vectorized_kernel>(),
-#endif
- cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
- cl::sycl::range<1>(tileSize)),
- conditional_vectorized_kernel(range, vectorizable_threads,
- evaluator))
- : cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- dev.program().template get_kernel<non_vectorized_kernel>(),
-#endif
- cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
- cl::sycl::range<1>(tileSize)),
- non_vectorized_kernel(range, vectorizable_threads,
- evaluator));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f));
- dev.async_synchronize(e);
+
+ dev.template nullary_kernel_launcher<
+ typename Evaluator::CoeffReturnType,
+ ExecExprFunctorKernel<Evaluator> >(
+ evaluator,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+ cl::sycl::range<1>(tileSize)),
+ Index(1), range);
}
evaluator.cleanup();
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 389d5d906..b115e502b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -123,7 +123,7 @@ struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
namespace TensorSycl {
namespace internal{
-template <typename Evaluator, typename Op> class ReductionFunctor;
+template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
}
}
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 700337539..d3628f94e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -421,7 +421,7 @@ template <typename Index, typename Device, bool BlockAccess> struct MemcpyTrigge
#ifdef EIGEN_USE_GPU
template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; }
+ EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
};
#endif
@@ -430,7 +430,7 @@ template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index
#ifdef EIGEN_USE_SYCL
template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> {
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; }
+ EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
};
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 84604cf41..0bb1e643e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -946,7 +946,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
#endif
#if defined(EIGEN_USE_SYCL)
- template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor;
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
 // SYCL needs the generic reducer for the case where the reduction algorithm is neither an inner, outer, nor full reducer
template <typename, typename, typename> friend struct internal::GenericReducer;
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index a379f5a94..387c3edf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -11,167 +11,576 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
- * TensorSyclPlaceHolderExpr.h
+ * TensorReductionSycl.h
*
* \brief:
- * This is the specialisation of the placeholder expression based on the
- * operation type
+ * This is the specialization of the reduction operation. A two-phase reduction approach
+ * is used because the GPU has no global synchronization of global memory across
+ * different work-groups/thread blocks. To solve this, two kernels are needed: the first
+ * kernel reduces the data locally and each work-group/thread block writes its partial
+ * result to global memory. In the second phase (global reduction) a single
+ * work-group/thread block reduces the intermediate results into one element.
+ * Here is an NVIDIA presentation explaining the optimized two-phase reduction algorithm on GPUs:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
*
-*****************************************************************/
+ *****************************************************************/
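
As a rough host-side illustration of the two-phase scheme described above, the sketch below models a work-group as a chunk of the input and the temporary global buffer as a std::vector of per-chunk partial results; the names and the sequential loops are illustrative only:

#include <algorithm>
#include <cstddef>
#include <vector>

// Phase 1: every "work-group" reduces its own chunk into one partial value.
// Phase 2: a single pass (one "work-group") reduces the partials into the result.
template <typename T, typename Op>
T two_phase_reduce(const std::vector<T>& in, Op op, T init, std::size_t chunk_size) {
  std::vector<T> partial;                            // stands in for the temporary global buffer
  for (std::size_t start = 0; start < in.size(); start += chunk_size) {
    T acc = init;
    const std::size_t end = std::min(in.size(), start + chunk_size);
    for (std::size_t i = start; i < end; ++i) acc = op(acc, in[i]);
    partial.push_back(acc);                          // one value per work-group
  }
  T result = init;                                   // second, single-group phase
  for (const T& p : partial) result = op(result, p);
  return result;
}
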
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-
namespace Eigen {
+namespace TensorSycl {
namespace internal {
-template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- do {
- auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
- cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
- cl::sycl::range<1>{std::min(length, local)}};
- /* Two accessors are used: one to the buffer that is being reduced,
- * and a second to local memory, used to store intermediate data. */
- auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
- typedef decltype(aI) InputAccessor;
- typedef decltype(aOut) OutputAccessor;
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
- LocalAccessor scratch(cl::sycl::range<1>(local), h);
-
- /* The parallel_for invocation chosen is the variant with an nd_item
- * parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
- };
- dev.sycl_queue().submit(f);
- dev.asynchronousExec();
-
- /* At this point, you could queue::wait_and_throw() to ensure that
- * errors are caught quickly. However, this would likely impact
- * performance negatively. */
- length = length / local;
-
- } while (length > 1);
-}
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+ typedef Op type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &) {
+ return accumulator;
+ }
};
-template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
-template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, out_offset, bufI, dev, length, local);
-}
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+ const Index &scale) {
+ ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+ return quotient_op(accumulator, CoeffReturnType(scale));
+ }
};
-/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
-/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
-/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
-// a leafNode.
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &scale) {
+ return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+ }
+};
- typedef typename Self::CoeffReturnType CoeffReturnType;
- static const bool HasOptimizedImplementation = false;
-
- static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
- typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
- int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
- size_t inputSize =self.impl().dimensions().TotalSize();
- size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
- size_t remaining = inputSize% red_factor;
- if(rng ==0) {
- red_factor=1;
- };
- size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
- size_t GRange=std::max((size_t )1, rng);
-
- // convert global range to power of 2 for redecution
- GRange--;
- GRange |= GRange >> 1;
- GRange |= GRange >> 2;
- GRange |= GRange >> 4;
- GRange |= GRange >> 8;
- GRange |= GRange >> 16;
-#if __x86_64__ || __ppc64__ || _WIN64
- GRange |= GRange >> 32;
-#endif
- GRange++;
- size_t outTileSize = tileSize;
- /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
- if (GRange < outTileSize) outTileSize=GRange;
- /// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can
- /// recursively apply reduction on it in order to reduce the whole.
- auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- // Dims dims= self.xprDims();
- //Op functor = reducer;
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is a workaround for gcc 4.8 bug
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
- // create a tuple of accessors from Evaluator
- TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(tmp_global_accessor) OutAccessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
- TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
- (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
- });
- dev.asynchronousExec();
-
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- auto out_buffer =dev.get_sycl_buffer(output);
- ptrdiff_t out_offset = dev.get_offset(output);
- /// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+ Index local_range>
+struct SecondStepFullReducer {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+ typedef typename OpDef::type Op;
+ LocalAccessor scratch;
+ InputAccessor aI;
+ OutputAccessor outAcc;
+ Op op;
+ SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+ : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+ // Our empirical research shows that the best performance is achieved
+ // when there is only one element per thread to reduce in the second
+ // step; the second-step reduction time is then almost negligible.
+ // Hence, in the second step the input size is fixed to the local size,
+ // so each thread reads exactly one element. The algorithm must be
+ // changed if the number of reductions per thread in the second step
+ // ever exceeds 1; otherwise the result will be wrong.
+ const Index localid = itemID.get_local_id(0);
+ auto aInPtr = aI.get_pointer() + localid;
+ auto aOutPtr = outAcc.get_pointer();
+ CoeffReturnType *scratchptr = scratch.get_pointer();
+ CoeffReturnType accumulator = *aInPtr;
+
+ scratchptr[localid] = op.finalize(accumulator);
+#pragma unroll 8
+ for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratchptr[localid + offset], &accumulator);
+ scratchptr[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) *aOutPtr = op.finalize(accumulator);
+ }
+};
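
The halving loop in operator() above is the classic local-memory tree reduction: at every step the first half of the active threads folds the other half's slots into their own, with a barrier between steps. A single-threaded sketch of the same access pattern (std::vector stands in for local memory, so the barriers are implicit):

#include <cstddef>
#include <vector>

// Fold scratch[i + offset] into scratch[i] for offset = n/2, n/4, ..., 1.
// scratch.size() must be a power of two; the result ends up in scratch[0].
template <typename T, typename Op>
T tree_reduce(std::vector<T> scratch, Op op) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    // (in the kernel: itemID.barrier(local_space) goes here)
    for (std::size_t i = 0; i < offset; ++i)   // only the first `offset` "threads" are active
      scratch[i] = op(scratch[i], scratch[i + offset]);
  }
  return scratch[0];
}
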
+
+// Full reduction, first phase. In this version vectorization is enabled and the
+// reduction accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+ (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ OpDef;
+
+ typedef typename OpDef::type Op;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::PacketReturnType PacketReturnType;
+ typedef
+ typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+ PacketReturnType, CoeffReturnType>::type OutType;
+ typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ LocalAccessor scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType final_output;
+ Index rng;
+ Op op;
+
+ FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+ Index rng_, OpType op_)
+ : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ Index start = Evaluator::PacketSize * globalid;
+ // vectorizable parts
+ PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+#pragma unroll(8 / Evaluator::PacketSize)
+ for (Index i = start; i < VectorizedRange; i += step) {
+ op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+ }
+ globalid += VectorizedRange;
+ // non vectorizable parts
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.template reducePacket<PacketReturnType>(
+ ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+ evaluator.impl().coeff(i), op.initialize()),
+ &packetAccumulator);
+ }
+ scratch[localid] = packetAccumulator =
+ OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+ // reduction part; the local size is always a power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+ scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] =
+ op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+ }
}
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ CoeffReturnType accumulator = op.initialize();
+ // scalar (non-vectorized) part
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.reduce(evaluator.impl().coeff(i), &accumulator);
+ }
+ scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+ // reduction part; the local size is always a power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratch[localid + offset], &accumulator);
+ scratch[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+ }
+ }
};
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ template <typename Scratch>
+ GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+ Index range_, Index num_values_to_reduce_)
+ : evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ functor(OpDef::get_op(functor_)),
+ range(range_),
+ num_values_to_reduce(num_values_to_reduce_) {}
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+ void operator()(cl::sycl::nd_item<1> itemID) {
+ auto output_accessor_ptr = output_accessor.get_pointer();
+ /// const cast added as a naive solution to solve the qualifier drop error
+ Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+ if (globalid < range) {
+ CoeffReturnType accum = functor.initialize();
+ Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+ evaluator, evaluator.firstInput(globalid), functor, &accum);
+ output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+ }
+ }
+
+ private:
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op functor;
+ Index range;
+ Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAcc;
+ ScratchAcc scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op op;
+ const Index preserve_elements_num_groups;
+ const Index reduce_elements_num_groups;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+ const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+ const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+ : scratch(scratch_),
+ evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ preserve_elements_num_groups(preserve_elements_num_groups_),
+ reduce_elements_num_groups(reduce_elements_num_groups_),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+ CoeffReturnType &accumulator) {
+ if (globalPId >= num_coeffs_to_preserve) {
+ return;
+ }
+ Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+ : globalRId + (globalPId * num_coeffs_to_reduce);
+ Index localOffset = globalRId;
+
+ const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+ const Index per_thread_global_stride =
+ rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+#pragma unroll 8
+ for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+ op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+ localOffset += per_thread_local_stride;
+ global_offset += per_thread_global_stride;
+ }
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index linearLocalThreadId = itemID.get_local_id(0);
+ Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+ Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+ const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+ : itemID.get_group(0) / reduce_elements_num_groups;
+ const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+ : itemID.get_group(0) % reduce_elements_num_groups;
+
+ Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+ auto scratchPtr = scratch.get_pointer().get();
+ auto outPtr =
+ output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+ CoeffReturnType accumulator = op.initialize();
+
+ element_wise_reduce(globalRId, globalPId, accumulator);
+
+ accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+ scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+ accumulator;
+ if (rt == reduction_dim::inner_most) {
+ pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+ rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+ globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ }
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ auto out_scratch_ptr =
+ scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (rt == reduction_dim::inner_most) {
+ accumulator = *out_scratch_ptr;
+ }
+ // The Local LocalThreadSizeR is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+ if (rLocalThreadId < offset) {
+ op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+ // The result has already been divided for the mean reducer in the
+ // previous reduction, so there is no need to divide again
+ *out_scratch_ptr = op.finalize(accumulator);
+ }
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+
+ if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+ outPtr[globalPId] = op.finalize(accumulator);
+ }
+ }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+ typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAccessor;
+ InputAccessor input_accessor;
+ OutputAccessor output_accessor;
+ Op op;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+ OutputAccessor output_accessor_, OpType op_,
+ const Index num_coeffs_to_preserve_,
+ const Index num_coeffs_to_reduce_)
+ : input_accessor(input_accessor_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index globalId = itemID.get_global_id(0);
+
+ if (globalId >= num_coeffs_to_preserve) return;
+
+ auto in_ptr = input_accessor.get_pointer() + globalId;
+
+ OutScalar accumulator = op.initialize();
+// num_coeffs_to_reduce is not bigger than 256
+#pragma unroll 8
+ for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+ op.reduce(*in_ptr, &accumulator);
+ in_ptr += num_coeffs_to_preserve;
+ }
+ output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+ }
+};
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+ static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+ static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+ static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
typedef typename Self::CoeffReturnType CoeffReturnType;
- static const bool HasOptimizedImplementation = false;
+ typedef typename Self::Storage Storage;
+ typedef typename Self::Index Index;
+ typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+ PannelParameters;
+
+ typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+ Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+ Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+ // getPowerOfTwo makes sure the local range is a power of 2 and <=
+ // maxSyclThreadPerBlock; this helps us avoid an extra check in the
+ // kernel
+ static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+ (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+
+ EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+ // In this step we force the code to perform at most a 2-step reduction:
+ // our empirical research shows that we get better performance when each
+ // thread reduces at least 64 elements individually. However, this can
+ // change on different platforms. For the inner_most dim reducer it is
+ // better to have 8 groups in the reduce dimension for sizes > 1024 to
+ // achieve the best performance.
+ const Index reductionPerThread = 64;
+ Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+ const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+ Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+ const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+ const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+ EIGEN_CONSTEXPR Index scratchSize =
+ PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (rNumGroups > 1) {
+ CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+ dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+ EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+
+ typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+ SecondStepPartialReductionKernel;
+
+ dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+ temp_accessor, output,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+ reducer, num_coeffs_to_preserve, rNumGroups);
+
+ self.device().deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+ }
+ return false;
+ }
+};
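
To make the launch arithmetic in run() concrete, here is one worked configuration with assumed numbers: a 16x16 local tile, 64 compute units (already a power of two, so getPowerOfTwo would leave it unchanged), 1000 coefficients to preserve and 100000 to reduce. None of these values come from the code; they only trace the formulas:

// Assumed inputs (illustrative only).
constexpr long LocalThreadSizeP = 16, LocalThreadSizeR = 16;     // 256 threads per work-group
constexpr long localRange = LocalThreadSizeP * LocalThreadSizeR;
constexpr long num_coeffs_to_preserve = 1000, num_coeffs_to_reduce = 100000;
constexpr long cu = 64;                                          // compute units reported by the device

constexpr long roundUpP   = ((num_coeffs_to_preserve + LocalThreadSizeP - 1) / LocalThreadSizeP) * LocalThreadSizeP;      // 1008
constexpr long pNumGroups = roundUpP / LocalThreadSizeP;                                                                  // 63
constexpr long rGroups    = (cu + pNumGroups - 1) / pNumGroups;                                                           // 2
constexpr long rNumGroups = (num_coeffs_to_reduce > 64 * localRange) ? (rGroups < localRange ? rGroups : localRange) : 1; // 2
constexpr long globalRange = pNumGroups * rNumGroups * localRange;                                                        // 32256
static_assert(globalRange == 32256, "two reduce-groups per preserve-group in this example");
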
+} // namespace internal
+} // namespace TensorSycl
+
+namespace internal {
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
- typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+ static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+ typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+ static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+ EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+ typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+ // In this step we force the code to perform at most a 2-step reduction:
+ // our empirical research shows that we get better performance if each
+ // thread reduces at least 512 elements individually.
+ const Index reductionPerThread = 2048;
+ Index reductionGroup = dev.getPowerOfTwo(
+ (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+ const Index num_work_group = std::min(reductionGroup, local_range);
+ const Index global_range = num_work_group * local_range;
+
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+ if (num_work_group > 1) {
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+ typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+ local_range, inputSize, reducer);
+
+ typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+ EvaluatorPointerType, Index, local_range>
+ GenericRKernel;
+ dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+ tmp_global_accessor, data,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+ reducer);
+
+ dev.deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+ reducer);
+ }
+ }
+};
+// vectorizable inner_most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
+ }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
+ }
+};
+
+// ArgMax uses this kernel for partial reduction
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
typename Self::Index range, GRange, tileSize;
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- /// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can
- /// recursively apply reduction on it in order to reduce the whole.
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is workaround for gcc 4.8 bug.
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
- // create a tuple of accessors from Evaluator
- Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output);
- ptrdiff_t out_offset = dev.get_offset(output);
- Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
-
- });
- dev.asynchronousExec();
+ dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+
+ dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+ TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+ self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+ reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
return false;
}
};
-} // end namespace internal
+} // namespace internal
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 000000000..0078692cd
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,512 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ * Tensor Scan Sycl implements an extended version of
+ * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
+ * The algorithm requires up to 3 stages (and consequently 3 kernels),
+ * depending on the size of the tensor. In the first kernel
+ * (ScanKernelFunctor), each thread within the work-group individually
+ * reduces its allocated elements in order to reduce the total number of
+ * blocks; then all threads within the work-group reduce the associated
+ * blocks into the temporary buffer. In the next kernel
+ * (ScanBlockKernelFunctor), the temporary buffer is given as input and all
+ * threads within a work-group scan and reduce the boundaries between the
+ * blocks generated by the previous kernel, writing the data back to the
+ * temporary buffer. If the second kernel is required, the third and final
+ * kernel (ScanAdjustmentKernelFunctor) adjusts the final result into the
+ * output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
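
The per-block scan below follows the work-efficient (Blelloch) pattern from the paper cited above: an up-sweep builds partial sums in place up a binary tree, the last element is cleared, and a down-sweep propagates the exclusive prefixes back down. A serial sketch on a power-of-two array, using plain addition for the accumulator:

#include <cstddef>
#include <vector>

// Exclusive prefix sum, in place, for data.size() a power of two.
void blelloch_scan(std::vector<int>& data) {
  const std::size_t n = data.size();
  std::size_t offset = 1;
  for (std::size_t d = n >> 1; d > 0; d >>= 1) {        // up-sweep: build sums up the tree
    for (std::size_t i = 0; i < d; ++i) {
      const std::size_t ai = offset * (2 * i + 1) - 1;
      const std::size_t bi = offset * (2 * i + 2) - 1;
      data[bi] += data[ai];
    }
    offset *= 2;
  }
  data[n - 1] = 0;                                      // clear the last element
  for (std::size_t d = 1; d < n; d *= 2) {              // down-sweep: distribute prefixes
    offset >>= 1;
    for (std::size_t i = 0; i < d; ++i) {
      const std::size_t ai = offset * (2 * i + 1) - 1;
      const std::size_t bi = offset * (2 * i + 2) - 1;
      const int t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}
// e.g. {1, 2, 3, 4} becomes {0, 1, 3, 6}.
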
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+ // must be power of 2
+ static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+ const index_t total_size;
+ const index_t non_scan_size;
+ const index_t scan_size;
+ const index_t non_scan_stride;
+ const index_t scan_stride;
+ const index_t panel_threads;
+ const index_t group_threads;
+ const index_t block_threads;
+ const index_t elements_per_group;
+ const index_t elements_per_block;
+ const index_t loop_range;
+
+ ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+ index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+ index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+ : total_size(total_size_),
+ non_scan_size(non_scan_size_),
+ scan_size(scan_size_),
+ non_scan_stride(non_scan_stride_),
+ scan_stride(scan_stride_),
+ panel_threads(panel_threads_),
+ group_threads(group_threads_),
+ block_threads(block_threads_),
+ elements_per_group(elements_per_group_),
+ elements_per_block(elements_per_block_),
+ loop_range(loop_range_) {}
+};
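
ScanParameters describes how a flat global thread id is decomposed into panel, group, block and local coordinates; the kernel below does this with a chain of divisions and remainders. A minimal sketch of that decomposition (ScanIds and decompose are hypothetical names used only for illustration):

struct ScanIds { long panel_id, group_id, block_id, local_id; };

// Mirror of the index arithmetic at the top of ScanKernelFunctor::operator():
// panel_threads >= group_threads >= block_threads, all describing the launch shape.
inline ScanIds decompose(long data_offset, long panel_threads, long group_threads,
                         long block_threads) {
  ScanIds ids;
  long tmp = data_offset % panel_threads;
  ids.panel_id = data_offset / panel_threads;   // which independent scan panel
  ids.group_id = tmp / group_threads;           // which non-scan slice inside the panel
  tmp = tmp % group_threads;
  ids.block_id = tmp / block_threads;           // which block along the scan dimension
  ids.local_id = tmp % block_threads;           // position of the thread inside the block
  return ids;
}
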
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+ scan_step stp>
+struct ScanKernelFunctor {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+ LocalAccessor scratch;
+ Evaluator dev_eval;
+ OutAccessor out_accessor;
+ OutAccessor temp_accessor;
+ const ScanParameters<Index> scanParameters;
+ Op accumulator;
+ const bool inclusive;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+ OutAccessor out_accessor_, OutAccessor temp_accessor_,
+ const ScanParameters<Index> scanParameters_, Op accumulator_,
+ const bool inclusive_)
+ : scratch(scratch_),
+ dev_eval(dev_eval_),
+ out_accessor(out_accessor_),
+ temp_accessor(temp_accessor_),
+ scanParameters(scanParameters_),
+ accumulator(accumulator_),
+ inclusive(inclusive_) {}
+
+ template <scan_step sst = stp, typename Input>
+ typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE
+ read(const Input &inpt, Index global_id) {
+ return inpt.coeff(global_id);
+ }
+
+ template <scan_step sst = stp, typename Input>
+ typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE
+ read(const Input &inpt, Index global_id) {
+ return inpt[global_id];
+ }
+
+ template <scan_step sst = stp, typename InclusiveOp>
+ typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+ inclusive_op();
+ }
+
+ template <scan_step sst = stp, typename InclusiveOp>
+ typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ first_step_inclusive_Operation(InclusiveOp) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto out_ptr = out_accessor.get_pointer();
+ auto tmp_ptr = temp_accessor.get_pointer();
+ auto scratch_ptr = scratch.get_pointer().get();
+
+ for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+ Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+ Index tmp = data_offset % scanParameters.panel_threads;
+ const Index panel_id = data_offset / scanParameters.panel_threads;
+ const Index group_id = tmp / scanParameters.group_threads;
+ tmp = tmp % scanParameters.group_threads;
+ const Index block_id = tmp / scanParameters.block_threads;
+ const Index local_id = tmp % scanParameters.block_threads;
+ // we put one element per packet in scratch_mem
+ const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+ const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+ CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+ CoeffReturnType inclusive_scan;
+ // the actual panel size is scan_size * non_scan_size.
+ // elements_per_panel is rounded up to a power of 2 for the binary tree
+ const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+ const Index group_offset = group_id * scanParameters.non_scan_stride;
+ // This will be effective when the size is bigger than elements_per_block
+ const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+ const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+ const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+ Index next_elements = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+ (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size))
+ ? read(dev_eval, global_id)
+ : accumulator.initialize();
+ next_elements += scanParameters.scan_stride;
+ }
+ first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+ if (inclusive) {
+ inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+ }
+ });
+ // This loop runs exactly twice (ScanPerThread / PacketSize == 2)
+ EIGEN_UNROLL_LOOP
+ for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+ Index private_offset = 1;
+ // build sum in place up the tree
+ EIGEN_UNROLL_LOOP
+ for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+ EIGEN_UNROLL_LOOP
+ for (Index l = 0; l < d; l++) {
+ Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+ Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(private_scan[ai], &accum);
+ accumulator.reduce(private_scan[bi], &accum);
+ private_scan[bi] = accumulator.finalize(accum);
+ }
+ private_offset *= 2;
+ }
+ scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+ private_scan[PacketSize - 1 + packetIndex];
+ private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+ // traverse down tree & build scan
+ EIGEN_UNROLL_LOOP
+ for (Index d = 1; d < PacketSize; d *= 2) {
+ private_offset >>= 1;
+ EIGEN_UNROLL_LOOP
+ for (Index l = 0; l < d; l++) {
+ Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+ Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(private_scan[ai], &accum);
+ accumulator.reduce(private_scan[bi], &accum);
+ private_scan[ai] = private_scan[bi];
+ private_scan[bi] = accumulator.finalize(accum);
+ }
+ }
+ }
+
+ Index offset = 1;
+ // build sum in place up the tree
+ for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (local_id < d) {
+ Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+ Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(scratch_ptr[ai], &accum);
+ accumulator.reduce(scratch_ptr[bi], &accum);
+ scratch_ptr[bi] = accumulator.finalize(accum);
+ }
+ offset *= 2;
+ }
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ // save this block's total for the next scan step (only needed when a group spans more than one block)
+ if (local_id == 0) {
+ if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+ const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+ scanParameters.non_scan_size +
+ group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+ block_id;
+ tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+ }
+ // clear the last element
+ scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+ }
+ // traverse down tree & build scan
+ for (Index d = 1; d < scratch_stride; d *= 2) {
+ offset >>= 1;
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (local_id < d) {
+ Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+ Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(scratch_ptr[ai], &accum);
+ accumulator.reduce(scratch_ptr[bi], &accum);
+ scratch_ptr[ai] = scratch_ptr[bi];
+ scratch_ptr[bi] = accumulator.finalize(accum);
+ }
+ }
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ // This loop runs exactly twice (ScanPerThread / PacketSize == 2)
+ EIGEN_UNROLL_LOOP
+ for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < PacketSize; i++) {
+ CoeffReturnType accum = private_scan[packetIndex + i];
+ accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+ private_scan[packetIndex + i] = accumulator.finalize(accum);
+ }
+ }
+ first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+ if (inclusive) {
+ accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+ private_scan[0] = accumulator.finalize(inclusive_scan);
+ }
+ });
+ next_elements = 0;
+ // write the scanned private values back to the output
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+ scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size)) {
+ Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+ out_ptr[global_id] = private_scan[private_id];
+ }
+ next_elements += scanParameters.scan_stride;
+ }
+ } // end for loop
+ }
+};
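ScanKernelFunctor above applies the classic work-efficient (Blelloch) exclusive scan twice: once per thread over its private packets and once per work group over the scratch memory. An up-sweep accumulates partial sums up a binary tree, the last element is reset to the accumulator's identity, and a down-sweep pushes the prefixes back down. The host-side sketch below shows the same up-sweep/down-sweep shape on a power-of-two array, with plain '+' standing in for the generic accumulator; it is an illustration of the pattern, not Eigen code.

#include <cstddef>
#include <vector>

// Host-side sketch of the up-sweep/down-sweep (Blelloch) exclusive scan that
// ScanKernelFunctor performs per packet and per work group. Plain '+' stands
// in for the generic accumulator and the size is assumed to be a power of two,
// matching the rounded-up sizes the kernel works with.
void blelloch_exclusive_scan(std::vector<int>& data) {
  const std::size_t n = data.size();
  // Up-sweep: build partial sums in place up the binary tree.
  for (std::size_t d = n >> 1, offset = 1; d > 0; d >>= 1, offset <<= 1) {
    for (std::size_t l = 0; l < d; ++l) {
      const std::size_t ai = offset * (2 * l + 1) - 1;
      const std::size_t bi = offset * (2 * l + 2) - 1;
      data[bi] += data[ai];
    }
  }
  // Reset the last element to the accumulator's identity.
  data[n - 1] = 0;
  // Down-sweep: walk back down the tree and distribute the prefixes.
  for (std::size_t d = 1, offset = n >> 1; d < n; d <<= 1, offset >>= 1) {
    for (std::size_t l = 0; l < d; ++l) {
      const std::size_t ai = offset * (2 * l + 1) - 1;
      const std::size_t bi = offset * (2 * l + 2) - 1;
      const int t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}

Running it on {1, 2, 3, 4} yields {0, 1, 3, 6}, the exclusive prefix sums.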
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+ InAccessor in_accessor;
+ OutAccessor out_accessor;
+ const ScanParameters<Index> scanParameters;
+ Op accumulator;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+ OutAccessor out_accessor_,
+ const ScanParameters<Index> scanParameters_,
+ Op accumulator_)
+ : in_accessor(in_accessor_),
+ out_accessor(out_accessor_),
+ scanParameters(scanParameters_),
+ accumulator(accumulator_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto in_ptr = in_accessor.get_pointer();
+ auto out_ptr = out_accessor.get_pointer();
+
+ for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+ Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+ Index tmp = data_offset % scanParameters.panel_threads;
+ const Index panel_id = data_offset / scanParameters.panel_threads;
+ const Index group_id = tmp / scanParameters.group_threads;
+ tmp = tmp % scanParameters.group_threads;
+ const Index block_id = tmp / scanParameters.block_threads;
+ const Index local_id = tmp % scanParameters.block_threads;
+
+ // the actual panel size is scan_size * non_scan_size.
+ // elements_per_panel is rounded up to a power of 2 for the binary tree
+ const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+ const Index group_offset = group_id * scanParameters.non_scan_stride;
+ // This only has an effect when the scan size is larger than elements_per_block
+ const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+ const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+ const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+ const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+ const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+ CoeffReturnType adjust_val = in_ptr[in_id];
+
+ Index next_elements = 0;
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+ scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size)) {
+ CoeffReturnType accum = adjust_val;
+ accumulator.reduce(out_ptr[global_id], &accum);
+ out_ptr[global_id] = accumulator.finalize(accum);
+ }
+ next_elements += scanParameters.scan_stride;
+ }
+ }
+ }
+};
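ScanAdjustmentKernelFunctor is only launched when a scan spans more than one block: each block has already been scanned locally and the block totals have themselves been scanned, so all that remains is to fold every element's scanned block total back in. A minimal serial equivalent, assuming a '+' accumulator and a hypothetical elements_per_block:

#include <cstddef>
#include <vector>

// Serial sketch of the adjustment pass: 'partial' holds each block's local
// exclusive scan and 'block_offsets' holds the exclusive scan of the block
// totals; every element gets its block's offset added in.
void adjust_block_offsets(std::vector<int>& partial,
                          const std::vector<int>& block_offsets,
                          std::size_t elements_per_block) {
  for (std::size_t i = 0; i < partial.size(); ++i) {
    partial[i] += block_offsets[i / elements_per_block];
  }
}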
+
+template <typename Index>
+struct ScanInfo {
+ const Index &total_size;
+ const Index &scan_size;
+ const Index &panel_size;
+ const Index &non_scan_size;
+ const Index &scan_stride;
+ const Index &non_scan_stride;
+
+ Index max_elements_per_block;
+ Index block_size;
+ Index panel_threads;
+ Index group_threads;
+ Index block_threads;
+ Index elements_per_group;
+ Index elements_per_block;
+ Index loop_range;
+ Index global_range;
+ Index local_range;
+ const Eigen::SyclDevice &dev;
+ EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+ const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+ const Eigen::SyclDevice &dev_)
+ : total_size(total_size_),
+ scan_size(scan_size_),
+ panel_size(panel_size_),
+ non_scan_size(non_scan_size_),
+ scan_stride(scan_stride_),
+ non_scan_stride(non_scan_stride_),
+ dev(dev_) {
+ // local_range must be a power of 2
+ local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+ Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+ max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+ elements_per_group =
+ dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+ const Index elements_per_panel = elements_per_group * non_scan_size;
+ elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+ panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+ group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+ block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+ block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+ const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+ const Index max_threads = panel_threads * panel_size;
+#endif
+ global_range = roundUp(max_threads, local_range);
+ loop_range = Index(
+ std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+ }
+ inline ScanParameters<Index> get_scan_parameter() {
+ return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+ group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+ }
+ inline cl::sycl::nd_range<1> get_thread_range() {
+ return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ }
+};
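ScanInfo turns the problem shape into a launch geometry: local_range is the nearest power-of-two work-group size, every thread handles ScanPerThread elements, the per-group element count is the scan size rounded up to a power of two, and loop_range makes each thread grid-stride over several chunks when the panel does not fit in one global range. The snippet below recomputes those quantities for one hypothetical configuration; the concrete numbers are assumptions chosen for illustration, not values taken from the library.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Recomputes the ScanInfo launch geometry for one hypothetical configuration
// (scan_size = 1000, non_scan_size = 8, panel_size = 1, work-group size = 256,
// ScanPerThread = 4).
int main() {
  using Index = std::int64_t;
  auto round_up = [](Index v, Index m) { return ((v + m - 1) / m) * m; };
  auto next_pow2 = [](Index v) { Index p = 1; while (p < v) p <<= 1; return p; };

  const Index scan_per_thread = 4, local_range = 256;
  const Index scan_size = 1000, non_scan_size = 8, panel_size = 1;

  const Index max_elements_per_block = local_range * scan_per_thread;                   // 1024
  const Index elements_per_group = next_pow2(round_up(scan_size, scan_per_thread));     // 1024
  const Index elements_per_panel = elements_per_group * non_scan_size;                  // 8192
  const Index elements_per_block = std::min(elements_per_group, max_elements_per_block);// 1024
  const Index panel_threads = elements_per_panel / scan_per_thread;                     // 2048
  const Index global_range = round_up(panel_threads * panel_size, local_range);         // 2048
  const Index loop_range = static_cast<Index>(std::ceil(
      double(elements_per_panel * panel_size) / double(global_range * scan_per_thread))); // 1
  std::cout << elements_per_block << ' ' << global_range << ' ' << loop_range << '\n';
}

With these values each group needs exactly one block (block_size == 1), so the recursive second step and the adjustment kernel would be skipped.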
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+ EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+ Reducer &accumulator, const Index total_size,
+ const Index scan_size, const Index panel_size,
+ const Index non_scan_size, const Index scan_stride,
+ const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+ auto scan_info =
+ ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+ typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+ AdjustFunctor;
+ dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+ scan_info.max_elements_per_block,
+ scan_info.get_scan_parameter(), accumulator);
+ }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+ template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+ EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+ const Index total_size, const Index scan_size, const Index panel_size,
+ const Index non_scan_size, const Index scan_stride,
+ const Index non_scan_stride, const bool inclusive,
+ const Eigen::SyclDevice &dev) {
+ auto scan_info =
+ ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+ const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+ const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+ typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+ dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+ in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+ scan_info.get_scan_parameter(), accumulator, inclusive);
+
+ if (scan_info.block_size > 1) {
+ ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+ tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+ non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+ SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+ tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+ non_scan_stride, dev);
+ }
+ dev.deallocate_temp(temp_pointer);
+ }
+};
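scan_block is a two-level scheme: the first launch scans every block and writes the block totals into a temporary buffer; if more than one block was needed, the totals are scanned again through the scan_step::second recursion and SYCLAdjustBlockOffset adds the scanned totals back onto the output. A serial sketch of the same control flow, assuming an exclusive scan with a '+' accumulator:

#include <algorithm>
#include <cstddef>
#include <vector>

// Serial sketch of the two-level scan: local scan per block, scan of the block
// totals, then an adjustment pass that folds each block's offset back in.
std::vector<int> two_level_exclusive_scan(const std::vector<int>& in,
                                          std::size_t elements_per_block) {
  std::vector<int> out(in.size());
  std::vector<int> totals((in.size() + elements_per_block - 1) / elements_per_block);
  for (std::size_t b = 0; b < totals.size(); ++b) {
    int running = 0;
    const std::size_t end = std::min(in.size(), (b + 1) * elements_per_block);
    for (std::size_t i = b * elements_per_block; i < end; ++i) {
      out[i] = running;          // local exclusive scan
      running += in[i];
    }
    totals[b] = running;         // block total, as written to the temporary buffer
  }
  int running = 0;
  for (std::size_t b = 0; b < totals.size(); ++b) {  // scan of the block totals
    const int t = totals[b];
    totals[b] = running;
    running += t;
  }
  for (std::size_t i = 0; i < out.size(); ++i)       // adjustment pass
    out[i] += totals[i / elements_per_block];
  return out;
}

For instance, two_level_exclusive_scan({1, 1, 1, 1, 1, 1}, 2) scans each pair to {0, 1}, scans the block totals {2, 2, 2} to {0, 2, 4}, and the adjustment yields {0, 1, 2, 3, 4, 5}.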
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice> {
+ typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::Storage Storage;
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ void operator()(Self &self, EvaluatorPointerType data) {
+ const Index total_size = internal::array_prod(self.dimensions());
+ const Index scan_size = self.size();
+ const Index scan_stride = self.stride();
+ // this is the scan accumulator op (e.g. sum for a cumulative sum)
+ auto accumulator = self.accumulator();
+ auto inclusive = !self.exclusive();
+ auto consume_dim = self.consume_dim();
+ auto dev = self.device();
+
+ auto dims = self.inner().dimensions();
+
+ Index non_scan_size = 1;
+ Index panel_size = 1;
+ if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < consume_dim; i++) {
+ non_scan_size *= dims[i];
+ }
+ for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+ panel_size *= dims[i];
+ }
+ } else {
+ for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+ non_scan_size *= dims[i];
+ }
+ for (int i = consume_dim - 1; i >= 0; i--) {
+ panel_size *= dims[i];
+ }
+ }
+ const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+ auto eval_impl = self.inner();
+ TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+ eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+ inclusive, dev);
+ }
+};
+} // namespace Eigen
+
+#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
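The ScanLauncher specialisation above flattens the tensor around the scanned dimension: in ColMajor, dimensions before consume_dim contribute to non_scan_size and dimensions after it to panel_size, while scan_stride is the stride of the scanned dimension itself. A worked example for a hypothetical 4 x 5 x 6 ColMajor tensor scanned along dimension 1; the shape, and the assumption that self.stride() equals the column-major stride of that dimension, are illustrative.

#include <array>
#include <iostream>

// Dimension bookkeeping for a hypothetical ColMajor tensor of shape (4, 5, 6)
// scanned along dimension 1, mirroring the loops in the launcher above.
int main() {
  const std::array<int, 3> dims = {4, 5, 6};
  const int consume_dim = 1;                 // the scanned dimension
  int non_scan_size = 1, panel_size = 1, scan_stride = 1;
  for (int i = 0; i < consume_dim; ++i) {
    non_scan_size *= dims[i];
    scan_stride *= dims[i];                  // column-major stride of dimension 1
  }
  for (int i = consume_dim + 1; i < 3; ++i) panel_size *= dims[i];
  const int scan_size = dims[consume_dim];
  const int non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
  std::cout << non_scan_size << ' ' << panel_size << ' ' << scan_size << ' '
            << scan_stride << ' ' << non_scan_stride << '\n';   // prints: 4 6 5 4 1
}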
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
deleted file mode 100644
index 7b8bd2df7..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-
-#ifdef EIGEN_USE_SYCL
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeGlobalPointer {
- typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
-};
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeLocalPointer {
- typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
-};
-
-
-namespace Eigen {
- template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
- template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
-namespace internal {
-
-#ifdef __SYCL_DEVICE_ONLY__
-template<typename A, typename B> struct TypeConversion {
- template<typename T>
- static typename MakeGlobalPointer<A>::Type get_address_space_pointer(typename MakeGlobalPointer<T>::Type p);
- template<typename T>
- static typename MakeLocalPointer<A>::Type get_address_space_pointer(typename MakeLocalPointer<T>::Type p);
-
- template<typename T>
- static A* get_address_space_pointer(T* p);
- typedef decltype(get_address_space_pointer(B())) type;
-};
-
-#endif
-}
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
-/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
- struct NoOP;
-
-template<bool IsConst, typename T> struct GetType{
- typedef const T Type;
-};
-template<typename T> struct GetType<false, T>{
- typedef T Type;
-};
-
-template <bool Conds, size_t X , size_t Y > struct ValueCondition {
- static constexpr size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X, Y> {
- static constexpr size_t Res =Y;
-};
-
-}
-}
-}
-
-// tuple construction
-#include "TensorSyclTuple.h"
-
-// counting number of leaf at compile time
-#include "TensorSyclLeafCount.h"
-
-// The index PlaceHolder takes the actual expression and replaces the actual
-// data on it with the place holder. It uses the same pre-order expression tree
-// traverse as the leaf count in order to give the right access number to each
-// node in the expression
-#include "TensorSyclPlaceHolderExpr.h"
-
-// creation of an accessor tuple from a tuple of SYCL buffers
-#include "TensorSyclExtractAccessor.h"
-
-// this is used to change the address space type in tensor map for GPU
-#include "TensorSyclConvertToDeviceExpression.h"
-
-// this is used to extract the functors
-#include "TensorSyclExtractFunctors.h"
-
-// this is used to create tensormap on the device
-// this is used to construct the expression on the device
-#include "TensorSyclExprConstructor.h"
-
-/// this is used for extracting tensor reduction
-#include "TensorReductionSycl.h"
-
-// TensorArgMaxSycl.h
-#include "TensorArgMaxSycl.h"
-
-/// this is used for extracting tensor convolution
-#include "TensorConvolutionSycl.h"
-
-// kernel execution using fusion
-#include "TensorSyclRun.h"
-//sycl functors
-#include "TensorSyclFunctors.h"
-
-#include "TensorContractionSycl.h"
-
-#endif // end of EIGEN_USE_SYCL
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
deleted file mode 100644
index d6ac7b91f..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ /dev/null
@@ -1,205 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- * Conversion from host pointer to device pointer
- * inside leaf nodes of the expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct ConvertToDeviceExpression
-/// \brief This struct is used to convert the MakePointer in the host expression
-/// to the MakeGlobalPointer for the device expression. For the leafNodes
-/// containing the pointer. This is due to the fact that the address space of
-/// the pointer T* is different on the host and the device.
-template <typename Expr>
-struct ConvertToDeviceExpression;
-
-template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
-struct NonOpConversion{
- typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
-};
-
-
-template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
-struct DeviceConvertor{
- typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
-};
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorMap
-#define TENSORMAPCONVERT(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
-};
-
-TENSORMAPCONVERT(const)
-TENSORMAPCONVERT()
-#undef TENSORMAPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
-#define CATEGORYCONVERT(CVQual)\
-template <template<class, class...> class Category, typename OP, typename... subExprs>\
-struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
- typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
-};
-CATEGORYCONVERT(const)
-CATEGORYCONVERT()
-#undef CATEGORYCONVERT
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorCwiseSelectOp
-#define SELECTOPCONVERT(CVQual, Res)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
-: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
-SELECTOPCONVERT(const, true)
-SELECTOPCONVERT(, false)
-#undef SELECTOPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is const AssingOP
-#define ASSIGNCONVERT(CVQual, Res)\
-template <typename LHSExpr, typename RHSExpr>\
-struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
-: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
-
-ASSIGNCONVERT(const, true)
-ASSIGNCONVERT(, false)
-#undef ASSIGNCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorEvalToOp
-#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
-: DeviceConvertor<ExprNode, Res, Expr>{};
-
-
-KERNELBROKERCONVERT(const, true, TensorEvalToOp)
-KERNELBROKERCONVERT(, false, TensorEvalToOp)
-#undef KERNELBROKERCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp
-#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
- typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-
-
-// TensorForcedEvalOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
-
-// TensorLayoutSwapOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
-
-//TensorIndexTupleOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
-#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
-#define KERNELBROKERCONVERTREDUCTION(CVQual)\
-template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
- typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
-};
-
-KERNELBROKERCONVERTREDUCTION(const)
-KERNELBROKERCONVERTREDUCTION()
-#undef KERNELBROKERCONVERTREDUCTION
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
-#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
-template <typename OP, typename Dim, typename subExpr>\
-struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
- typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
-};
-
-KERNELBROKERCONVERTTUPLEREDUCTION(const)
-KERNELBROKERCONVERTTUPLEREDUCTION()
-#undef KERNELBROKERCONVERTTUPLEREDUCTION
-
-//TensorSlicingOp
-#define KERNELBROKERCONVERTSLICEOP(CVQual)\
-template<typename StartIndices, typename Sizes, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTSLICEOP(const)
-KERNELBROKERCONVERTSLICEOP()
-#undef KERNELBROKERCONVERTSLICEOP
-
-//TensorStridingSlicingOp
-#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTERSLICESTRIDEOP(const)
-KERNELBROKERCONVERTERSLICESTRIDEOP()
-#undef KERNELBROKERCONVERTERSLICESTRIDEOP
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
-#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
- typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTCHIPPINGOP(const)
-KERNELBROKERCONVERTCHIPPINGOP()
-#undef KERNELBROKERCONVERTCHIPPINGOP
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
-#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\
- typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-KERNELBROKERCONVERTIMAGEPATCHOP(const)
-KERNELBROKERCONVERTIMAGEPATCHOP()
-#undef KERNELBROKERCONVERTIMAGEPATCHOP
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp
-#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\
-template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\
- typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-KERNELBROKERCONVERTVOLUMEPATCHOP(const)
-KERNELBROKERCONVERTVOLUMEPATCHOP()
-#undef KERNELBROKERCONVERTVOLUMEPATCHOP
-
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX1
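The headers deleted in this commit, including this one, stamp out their const and non-const specialisations from a single macro taking a CVQual token (TENSORMAPCONVERT(const) / TENSORMAPCONVERT() above). A minimal standalone illustration of that pattern follows; the ConvertLeaf name and the pointer-based example are invented for this sketch and are not Eigen code.

#include <type_traits>

// One macro body generates both the const and the non-const specialisation;
// invoking it with an empty argument makes CVQual expand to nothing.
template <typename T> struct ConvertLeaf;   // primary template, illustrative

#define LEAF_CONVERT(CVQual)          \
template <typename T>                 \
struct ConvertLeaf<CVQual T*> {       \
  typedef CVQual T* Type;             \
};

LEAF_CONVERT(const)
LEAF_CONVERT()
#undef LEAF_CONVERT

static_assert(std::is_same<ConvertLeaf<const int*>::Type, const int*>::value, "");
static_assert(std::is_same<ConvertLeaf<int*>::Type, int*>::value, "");

int main() {}

The real macros additionally swap the MakePointer template for MakeGlobalPointer so that leaf pointers carry the device address space, as described in the removed file's own comments.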
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
deleted file mode 100644
index 67003daf5..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ /dev/null
@@ -1,514 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExprConstructor.h
- *
- * \brief:
- * This file re-create an expression on the SYCL device in order
- * to use the original tensor evaluator.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-template <typename Expr, typename Dims>
-struct DeviceFixedSizeTensor;
-
-template <typename Expr, typename std::ptrdiff_t... Indices>
-struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{
- template<typename Data>
- static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);}
-};
-/// this class is used by EvalToOp in order to create an lhs expression which is
-/// a pointer from an accessor on device-only buffer
-template <typename PtrType, size_t N, typename... Params>
-struct EvalToLHSConstructor {
- PtrType expr;
- EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {}
-};
-
-/// struct ExprConstructor is used to reconstruct the expression on the device and
-/// recreate the expression with MakeGlobalPointer containing the device address
-/// space for the TensorMap pointers used in eval function.
-/// It receives the original expression type, the functor of the node, the tuple
-/// of accessors, and the device expression type to re-instantiate the
-/// expression tree for the device
-template <typename OrigExpr, typename IndexExpr, typename... Params>
-struct ExprConstructor;
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAP(CVQual)\
-template <typename T, int Options_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
-};
-
-TENSORMAP(const)
-TENSORMAP()
-#undef TENSORMAP
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
- : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
-};
-
-TENSORMAPFIXEDSIZE(const)
-TENSORMAPFIXEDSIZE()
-#undef TENSORMAPFIXEDSIZE
-
-#define UNARYCATEGORY(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
- typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
- my_type rhsExpr;\
- typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
-};
-
-UNARYCATEGORY(const)
-UNARYCATEGORY()
-#undef UNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorBinaryOp
-#define BINARYCATEGORY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
-typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>, CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
- typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
- typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
- typedef CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
- my_left_type lhsExpr;\
- my_right_type rhsExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
-};
-
-BINARYCATEGORY(const)
-BINARYCATEGORY()
-#undef BINARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseTernaryOp
-#define TERNARYCATEGORY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
-typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
-struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
- typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
- typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
- typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
- typedef CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
- my_arg1_type arg1Expr;\
- my_arg2_type arg2Expr;\
- my_arg3_type arg3Expr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
- : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
-};
-
-TERNARYCATEGORY(const)
-TERNARYCATEGORY()
-#undef TERNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseSelectOp
-#define SELECTOP(CVQual)\
-template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
-struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
- typedef ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
- typedef ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
- typedef ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
- typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
- my_if_type ifExpr;\
- my_then_type thenExpr;\
- my_else_type elseExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
-};
-
-SELECTOP(const)
-SELECTOP()
-#undef SELECTOP
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// const TensorAssignOp
-#define ASSIGN(CVQual)\
-template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
- typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
- typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
- typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type> Type;\
- my_left_type lhsExpr;\
- my_right_type rhsExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
- };
-
- ASSIGN(const)
- ASSIGN()
- #undef ASSIGN
-
- /// specialisation of the \ref ExprConstructor struct when the node type is
- /// const TensorAssignOp
- #define CONVERSIONEXPRCONST(CVQual)\
- template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
- struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
- typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
- typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
- my_nested_type nestedExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
- };
-
- CONVERSIONEXPRCONST(const)
- CONVERSIONEXPRCONST()
- #undef CONVERSIONEXPRCONST
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorEvalToOp /// 0 here is the output number in the buffer
-#define EVALTO(CVQual)\
-template <typename OrigExpr, typename Expr, typename... Params>\
-struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
- typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
- typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
- typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
- my_expr_type nestedExpression;\
- EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
-};
-
-EVALTO(const)
-EVALTO()
-#undef EVALTO
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
-CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef TensorForcedEvalOp<OrigExpr> XprType;\
- typedef CVQual TensorMap<\
- Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
- Eigen::internal::traits<XprType>::Layout, \
- MakeGlobalPointer\
- > Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-#define TENSORCUSTOMUNARYOP(CVQual)\
-template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
-CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
- typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
- typedef CVQual TensorMap<\
- Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
- Eigen::internal::traits<XprType>::Layout, \
- MakeGlobalPointer\
- > Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-TENSORCUSTOMUNARYOP(const)
-TENSORCUSTOMUNARYOP()
-#undef TENSORCUSTOMUNARYOP
-
-/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXPR(CVQual)\
-template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
- static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
- typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
- NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-SYCLREDUCTIONEXPR(const)
-SYCLREDUCTIONEXPR()
-#undef SYCLREDUCTIONEXPR
-
-/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
-/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP.
-#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
-template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
-CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
- static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
- static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
-static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
- typedef CVQual TensorMap<\
- Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
- Layout,\
- MakeGlobalPointer\
- > XprType;\
- typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
- static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
- typedef array<Index, NumDims> StrideDims;\
- typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
- fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
- }\
-};
-
-SYCLTUPLEREDUCTIONEXPR(const)
-SYCLTUPLEREDUCTIONEXPR()
-#undef SYCLTUPLEREDUCTIONEXPR
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
-#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
-template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
-struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
-CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
- static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
- typedef CVQual TensorMap<\
- Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
- Eigen::internal::traits<XprTyp>::Layout, \
- MakeGlobalPointer\
- > Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-//TensorContractionOp
-SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
-SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
-//TensorConvolutionOp
-SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
-SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
-//TensorCustomBinaryOp
-SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
-SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
-#undef SYCLCONTRACTCONVCUSBIOPS
-
-//TensorSlicingOp
-#define SYCLSLICEOPEXPR(CVQual)\
-template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\
-};
-
-SYCLSLICEOPEXPR(const)
-SYCLSLICEOPEXPR()
-#undef SYCLSLICEOPEXPR
-
-//TensorStridingSlicingOp
-#define SYCLSLICESTRIDEOPEXPR(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\
-};
-
-SYCLSLICESTRIDEOPEXPR(const)
-SYCLSLICESTRIDEOPEXPR()
-#undef SYCLSLICESTRIDEOPEXPR
-
-//TensorReshapingOp and TensorShufflingOp
-#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
-};
-
-// TensorReshapingOp
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-// TensorShufflingOp
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
-#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
-
-//TensorPaddingOp
-#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
-};
-
-//TensorPaddingOp
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
-#undef SYCLPADDINGOPEXPRCONST
-
-// TensorChippingOp
-#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
-template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
-};
-
-SYCLTENSORCHIPPINGOPEXPR(const)
-SYCLTENSORCHIPPINGOPEXPR()
-#undef SYCLTENSORCHIPPINGOPEXPR
-
-// TensorImagePatchOp
-#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\
- funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \
- funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\
-};
-
-SYCLTENSORIMAGEPATCHOPEXPR(const)
-SYCLTENSORIMAGEPATCHOPEXPR()
-#undef SYCLTENSORIMAGEPATCHOPEXPR
-
-// TensorVolumePatchOp
-#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... > {\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\
- funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \
- funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \
- funcD.m_padding_type, funcD.m_padding_value ){\
- }\
-};
-
-SYCLTENSORVOLUMEPATCHOPEXPR(const)
-SYCLTENSORVOLUMEPATCHOPEXPR()
-#undef SYCLTENSORVOLUMEPATCHOPEXPR
-
-// TensorLayoutSwapOp and TensorIndexTupleOp
-#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
-template<typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
-};
-
-//TensorLayoutSwapOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
-//TensorIndexTupleOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
-
-#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
-
-/// template deduction for \ref ExprConstructor struct
-template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
-auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
- -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
- return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
deleted file mode 100644
index 3c74d1696..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ /dev/null
@@ -1,310 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExtractAccessor.h
- *
- * \brief:
- * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl
- * buffers as an input. Using pre-order tree traversal, ExtractAccessor
- * recursively calls itself for its children in the expression tree. The
- * leaf node in the PlaceHolder expression is nothing but a container preserving
- * the order of the actual data in the tuple of sycl buffer. By invoking the
- * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
- * buffer in the tuple of buffers. This accessor is then added as an Nth
- * element in the tuple of accessors. In this case we preserve the order of data
- * in the expression tree.
- *
- * This is the specialisation of extract accessor method for different operation
- * type in the PlaceHolder expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
-
-/// struct ExtractAccessor: Extract Accessor Class is used to extract the
-/// accessor from a buffer.
-/// Depending on the type of the leaf node we can get a read accessor or a
-/// read_write accessor
-template <typename Evaluator>
-struct ExtractAccessor;
-
-struct AccessorConstructor{
- template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
-
- template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
-
- template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
-
- template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
-#define SYCLUNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
-RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLUNARYCATEGORYEXTACC(const)
-SYCLUNARYCATEGORYEXTACC()
-#undef SYCLUNARYCATEGORYEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-#define SYCLBINARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
-SYCLBINARYCATEGORYEXTACC(const)
-SYCLBINARYCATEGORYEXTACC()
-#undef SYCLBINARYCATEGORYEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseTernaryOp
-#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
-};
-
-SYCLTERNARYCATEGORYEXTACC(const)
-SYCLTERNARYCATEGORYEXTACC()
-#undef SYCLTERNARYCATEGORYEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseSelectOp. This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
-};
-
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
-SYCLTENSORASSIGNOPEXTACC(const)
-SYCLTENSORASSIGNOPEXTACC()
-#undef SYCLTENSORASSIGNOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
-#define TENSORMAPEXPR(CVQual, ACCType)\
-template <typename PlainObjectType, int Options_, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
-};
-
-TENSORMAPEXPR(const, cl::sycl::access::mode::read)
-TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
-#undef TENSORMAPEXPR
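The two instantiations above pick the access mode from the constness of the TensorMap leaf: const leaves are only read on the device, while mutable leaves (for example the destination of an assignment) need read_write access. A compile-time sketch of that mapping, with a toy AccessMode enum and ToyTensorMap type assumed only for illustration:

#include <type_traits>

enum class AccessMode { Read, ReadWrite };

// Const leaves are only read on the device; mutable leaves need read_write.
template <typename Leaf>
constexpr AccessMode mode_for() {
  return std::is_const<Leaf>::value ? AccessMode::Read : AccessMode::ReadWrite;
}

struct ToyTensorMap {};  // stand-in for a TensorMap leaf

static_assert(mode_for<const ToyTensorMap>() == AccessMode::Read, "const leaf is read-only");
static_assert(mode_for<ToyTensorMap>() == AccessMode::ReadWrite, "mutable leaf is read_write");

int main() { return 0; }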
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
-//TensorCustomUnaryOp
-#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
-template <typename CustomUnaryFunc, typename XprType, typename Dev >\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-
-SYCLCUSTOMUNARYOPEXTACC(const)
-SYCLCUSTOMUNARYOPEXTACC()
-#undef SYCLCUSTOMUNARYOPEXTACC
-
-//TensorCustomBinaryOp
-#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
-template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCUSTOMBINARYOPEXTACC(const)
-SYCLCUSTOMBINARYOPEXTACC()
-#undef SYCLCUSTOMBINARYOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
-};
-
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
-template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-// TensorReductionOp
-SYCLREDUCTIONEXTACC(const,TensorReductionOp)
-SYCLREDUCTIONEXTACC(,TensorReductionOp)
-
-// TensorTupleReducerOp
-SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
-SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
- struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-//TensorContractionOp
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-//TensorConvolutionOp
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorImagePatchOp.
-#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORIMAGEPATCHOPEXTACC(const)
-SYCLTENSORIMAGEPATCHOPEXTACC()
-#undef SYCLTENSORIMAGEPATCHOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorVolumePatchOp.
-#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORVOLUMEPATCHOPEXTACC(const)
-SYCLTENSORVOLUMEPATCHOPEXTACC()
-#undef SYCLTENSORVOLUMEPATCHOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorLayoutSwapOp, TensorIndexTupleOp
-#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
-template<typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-// TensorLayoutSwapOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
-//TensorIndexTupleOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
-
-#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
-
-/// template deduction for \ref ExtractAccessor
-template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
- return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
deleted file mode 100644
index 09407e53e..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ /dev/null
@@ -1,467 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExtractFunctors.h
- *
- * \brief:
- * Used to extract all the functors allocated to each node of the expression
- * tree.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// struct FunctorExtractor: This struct is used to extract the functors
-/// constructed on the host side, pack them, and reuse them when the
-/// expression is reconstructed on the device.
-/// We have to do this because Eigen functors are not stateless, so we cannot
-/// re-instantiate them on the device; the instantiated functors have to be
-/// passed to the device instead.
-// This struct is used for leaf nodes (TensorMap) and nodes behaving like leaf nodes (TensorForcedEval).
-#define DEFALTACTION(Evaluator)\
-typedef typename Evaluator::Dimensions Dimensions;\
-const Dimensions m_dimensions;\
-EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
-FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
-
-template <typename Evaluator> struct FunctorExtractor{
- DEFALTACTION(Evaluator)
-};
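A small self-contained sketch of this extraction scheme: a made-up stateful functor is captured by value from a toy evaluator so that it can travel to the device with the packed expression. ScaleBy, ToyEvaluator and ToyFunctorExtractor are illustrative names only, not Eigen types.

#include <cassert>

// A stateful functor constructed on the host: it cannot simply be
// default-constructed again on the device, so its state must be shipped.
struct ScaleBy {
  float factor;
  float operator()(float x) const { return factor * x; }
};

// Toy host-side evaluator holding the functor (stands in for TensorEvaluator).
template <typename Op> struct ToyEvaluator {
  Op op;
  const Op& functor() const { return op; }
};

// Toy extractor: copies the functor by value so it can be packed into the
// kernel arguments and reused when the expression is rebuilt on the device.
template <typename Op> struct ToyFunctorExtractor {
  Op func;
  explicit ToyFunctorExtractor(const ToyEvaluator<Op>& eval) : func(eval.functor()) {}
};

int main() {
  ToyEvaluator<ScaleBy> host_eval{ScaleBy{2.5f}};
  ToyFunctorExtractor<ScaleBy> packed(host_eval);  // what would be sent to the device
  assert(packed.func(4.0f) == 10.0f);              // the captured state is preserved
  return 0;
}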
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-/// TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template <typename ArgType1, typename ArgType2, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
- FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
- : subExpr(expr.impl()) {}\
-};
-
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
-#undef SYCLEXTRFUNCCONVERSION
-
-#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
-struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
-FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
-};
-
-SYCLEXTRTENSORMAPFIXEDSIZE(const)
-SYCLEXTRTENSORMAPFIXEDSIZE()
-#undef SYCLEXTRTENSORMAPFIXEDSIZE
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-#define SYCLEXTRFUNCUNARY(CVQual)\
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()), func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCUNARY(const)
-SYCLEXTRFUNCUNARY()
-#undef SYCLEXTRFUNCUNARY
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseBinaryOp
-#define SYCLEXTRFUNCBIINARY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCBIINARY(const)
-SYCLEXTRFUNCBIINARY()
-#undef SYCLEXTRFUNCBIINARY
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
-#define SYCLEXTRFUNCTERNARY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
- FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
- FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
- : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCTERNARY(const)
-SYCLEXTRFUNCTERNARY()
-#undef SYCLEXTRFUNCTERNARY
-
-
-
-//TensorCustomOp must be specialised, otherwise it would be captured by UnaryCategory although its behaviour is different
-//from UnaryCategory's; it is closer to the general FunctorExtractor.
-/// specialisation of TensorCustomOp
-#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
-template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
- typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
- DEFALTACTION(Evaluator)\
-};
-//TensorCustomUnaryOp
-SYCLEXTRFUNCCUSTOMUNARYOP(const)
-SYCLEXTRFUNCCUSTOMUNARYOP()
-#undef SYCLEXTRFUNCCUSTOMUNARYOP
-
-//TensorCustomBinaryOp
-#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
-template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
- typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
- DEFALTACTION(Evaluator)\
-};
-//TensorCustomBinaryOp
-SYCLEXTRFUNCCUSTOMBIBARYOP(const)
-SYCLEXTRFUNCCUSTOMBIBARYOP()
-#undef SYCLEXTRFUNCCUSTOMBIBARYOP
-
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCSELECTOP(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
- FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
- FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
- : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
-};
-
-SYCLEXTRFUNCSELECTOP(const)
-SYCLEXTRFUNCSELECTOP()
-#undef SYCLEXTRFUNCSELECTOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorAssignOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
-};
-SYCLEXTRFUNCASSIGNOP(const)
-SYCLEXTRFUNCASSIGNOP()
-#undef SYCLEXTRFUNCASSIGNOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node types are
-/// TensorEvalToOp, TensorLayoutSwapOp and TensorIndexTupleOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
-template <typename Expr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\
- : xprExpr(expr.impl()) {}\
-};
-//TensorEvalToOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
-// TensorLayoutSwapOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
-// TensorIndexTupleOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
-
-#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
-
-template<typename Dim, size_t NumOutputDim> struct DimConstr {
-template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;}
-};
-
-template<typename Dim> struct DimConstr<Dim, 0> {
- template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
-};
-//TensorReductionOp
-#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
- typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
- typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
-};
-SYCLEXTRFUNCREDUCTIONOP(const)
-SYCLEXTRFUNCREDUCTIONOP()
-#undef SYCLEXTRFUNCREDUCTIONOP
-
-//TensorTupleReducerOp
-#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
-template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
- struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
- typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
- static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
- typedef typename Evaluator::StrideDims StrideDims;\
- typedef typename Evaluator::Index Index;\
- typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- const Index m_return_dim;\
- const StrideDims m_strides;\
- const Index m_stride_mod;\
- const Index m_stride_div;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\
- EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\
- EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\
- EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\
- FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
- m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
-};
-
-SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
-SYCLEXTRFUNCTUPLEREDUCTIONOP()
-#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
-
-//TensorContractionOp and TensorConvolutionOp
-#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
- typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
- typedef typename Evaluator::Dimensions Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
- : m_dimensions(expr.dimensions()) {}\
-};
-
-//TensorContractionOp
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
-//TensorConvolutionOp
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
-#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorSlicingOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_offsets;\
- const Sizes m_dimensions;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
- EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
-};
-
-SYCLEXTRFUNCTSLICEOP(const)
-SYCLEXTRFUNCTSLICEOP()
-#undef SYCLEXTRFUNCTSLICEOP
-
-//TensorStridingSlicingOp
-#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_startIndices;\
- const StopIndices m_stopIndices;\
- const Strides m_strides;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
- EIGEN_STRONG_INLINE const StopIndices& stopIndices() const { return m_stopIndices; }\
- EIGEN_STRONG_INLINE const Strides& strides() const { return m_strides; }\
-};
-
-SYCLEXTRFUNCTSLICESTRIDEOP(const)
-SYCLEXTRFUNCTSLICESTRIDEOP()
-#undef SYCLEXTRFUNCTSLICESTRIDEOP
-
-// TensorReshapingOp and TensorShufflingOp had to be separated, otherwise they would be mistaken for UnaryCategory.
-#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
-};
-
-//TensorReshapingOp
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
-
-//TensorShufflingOp
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
-
-// TensorPaddingOp had to be separated as well, otherwise it would be mistaken for UnaryCategory.
-#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
- const Scalar m_scalar_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
-};
-
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
-#undef PADDINGOPFUNCEXT
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp or TensorConcatenationOp.
-/// For TensorContractionOp the LHS and RHS here are the original expressions, so no condition needs to be applied to their type.
-#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
-template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const Param func;\
- FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
-};
-
-// TensorConcatenationOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
-#undef SYCLEXTRFUNCCONTRACTCONCAT
-
-//TensorChippingOp
-#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
- const DenseIndex m_dim;\
- const DenseIndex m_offset;\
- EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
- EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
- : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
-};
-
-SYCLEXTRFUNCCHIPPINGOP(const)
-SYCLEXTRFUNCCHIPPINGOP()
-#undef SYCLEXTRFUNCCHIPPINGOP
-
-//TensorImagePatchOp
-#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
-typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\
-FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
-const DenseIndex m_patch_rows;\
-const DenseIndex m_patch_cols;\
-const DenseIndex m_row_strides;\
-const DenseIndex m_col_strides;\
-const DenseIndex m_in_row_strides;\
-const DenseIndex m_in_col_strides;\
-const DenseIndex m_row_inflate_strides;\
-const DenseIndex m_col_inflate_strides;\
-const bool m_padding_explicit;\
-const DenseIndex m_padding_top;\
-const DenseIndex m_padding_bottom;\
-const DenseIndex m_padding_left;\
-const DenseIndex m_padding_right;\
-const PaddingType m_padding_type;\
-const typename Self::Scalar m_padding_value;\
-FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
-: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
- m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
- m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
- m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\
- m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\
- m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
- m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\
- m_padding_value(expr.xpr().padding_value()){}\
-};
-
-SYCLEXTRFUNCIMAGEPATCHOP(const)
-SYCLEXTRFUNCIMAGEPATCHOP()
-#undef SYCLEXTRFUNCIMAGEPATCHOP
-
-/// TensorVolumePatchOp
-#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\
-typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> Self;\
-FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
-const DenseIndex m_patch_planes;\
-const DenseIndex m_patch_rows;\
-const DenseIndex m_patch_cols;\
-const DenseIndex m_plane_strides;\
-const DenseIndex m_row_strides;\
-const DenseIndex m_col_strides;\
-const DenseIndex m_in_plane_strides;\
-const DenseIndex m_in_row_strides;\
-const DenseIndex m_in_col_strides;\
-const DenseIndex m_plane_inflate_strides;\
-const DenseIndex m_row_inflate_strides;\
-const DenseIndex m_col_inflate_strides;\
-const bool m_padding_explicit;\
-const DenseIndex m_padding_top_z;\
-const DenseIndex m_padding_bottom_z;\
-const DenseIndex m_padding_top;\
-const DenseIndex m_padding_bottom;\
-const DenseIndex m_padding_left;\
-const DenseIndex m_padding_right;\
-const PaddingType m_padding_type;\
-const typename Self::Scalar m_padding_value;\
-FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
-: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
- m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
- m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
- m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\
- m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\
- m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \
- m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
- m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\
-};
-SYCLEXTRFUNCVOLUMEPATCHOP(const)
-SYCLEXTRFUNCVOLUMEPATCHOP()
-#undef SYCLEXTRFUNCVOLUMEPATCHOP
-
-
-/// template deduction function for FunctorExtractor
-template <typename Evaluator>
-auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
- return FunctorExtractor<Evaluator>(evaluator);
-}
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
deleted file mode 100644
index a447c3f88..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ /dev/null
@@ -1,248 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
- OP op;
- OutputAccessor aOut;
- ptrdiff_t out_offset;
- InputAccessor aI;
- LocalAccessor scratch;
- size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- size_t globalid = itemID.get_global(0);
- size_t localid = itemID.get_local(0);
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- if (globalid < length) {
- scratch[localid] = aI[globalid];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- if (globalid < length) {
- auto min = (length < local) ? length : local;
- for (size_t offset = min / 2; offset > 0; offset /= 2) {
- if (localid < offset) {
- auto accum = op.initialize();
- op.reduce(scratch[localid], &accum);
- op.reduce(scratch[localid + offset], &accum);
- op.finalize(accum);
- scratch[localid]=accum;
- //scratch[localid] += scratch[localid + offset];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- /* The final result will be stored in local id 0. */
- if (localid == 0) {
- aI[itemID.get_group(0)] = scratch[localid];
- if((length<=local) && globalid ==0){
- auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
- }
- }
- }
- }
-
- };
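The kernel above is the classic local-memory tree reduction: every work-item loads one element into scratch, a barrier synchronises the work-group, and then the active range is halved on every step until the result sits in local id 0. A serial, host-only sketch of the same indexing pattern, with a plain std::vector standing in for the local accessor and addition for the reducer op:

#include <cassert>
#include <cstddef>
#include <vector>

// Serial emulation of one work-group: scratch plays the role of the local
// accessor, and each outer iteration corresponds to the code between two barriers.
int reduce_workgroup(std::vector<int> scratch) {
  const std::size_t local = scratch.size();  // work-group size (a power of two here)
  for (std::size_t offset = local / 2; offset > 0; offset /= 2) {
    for (std::size_t localid = 0; localid < offset; ++localid) {
      // op.reduce(...) in the kernel; plain addition in this sketch.
      scratch[localid] = scratch[localid] + scratch[localid + offset];
    }
    // (a local barrier separates the steps in the real kernel)
  }
  return scratch[0];  // the result ends up in local id 0
}

int main() {
  std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(reduce_workgroup(data) == 36);
  return 0;
}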
-
-/// ReductionFunctor
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that a reduction has two behaviours:
-    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
-    /// calculated result to its parent kernel. While the latter is detected automatically by our device expression generator, the former has to be created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-    /// This is the evaluator for device_self_expr. It is identical to the self that was passed to the run function, except that
-    /// the device evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
- }
- }
- private:
- write_accessor output_accessor;
- ptrdiff_t out_offset;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
-};
-
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
-class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
- typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
- Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that a reduction has two behaviours:
-    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
-    /// calculated result to its parent kernel. While the latter is detected automatically by our device expression generator, the former has to be created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-    /// This is the evaluator for device_self_expr. It is identical to the self that was passed to the run function, except that
-    /// the device evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
- }
- }
- private:
- write_accessor output_accessor;
- ptrdiff_t out_offset;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that a reduction has two behaviours:
-    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
-    /// calculated result to its parent kernel. While the latter is detected automatically by our device expression generator, the former has to be created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
-    /// This is the evaluator for device_self_expr. It is identical to the self that was passed to the run function, except that
-    /// the device evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
-
- tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
- : static_cast<CoeffReturnType>(op.initialize());
-
- if(remaining!=0 && globalid==0 ){
-      // this adds the rest of the input buffer when the input size is not divisible by red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
- reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum;
-
- }
- }
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that a reduction has two behaviours:
-    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
-    /// calculated result to its parent kernel. While the latter is detected automatically by our device expression generator, the former has to be created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
-    /// This is the evaluator for device_self_expr. It is identical to the self that was passed to the run function, except that
-    /// the device evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
- auto scale = (rng*red_factor) + remaining;
-
- tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
- :static_cast<CoeffReturnType>(op.initialize())/scale;
-
- if(remaining!=0 && globalid==0 ){
-      // this adds the rest of the input buffer when the input size is not divisible by red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum/scale;
-
- }
- }
-};
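The MeanReducer specialisation above turns the sum-based kernel into a mean by dividing every partial result by scale = rng*red_factor + remaining, i.e. the total number of input elements. A host-only sketch of that bookkeeping with made-up sizes (10 inputs reduced in chunks of red_factor = 4):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  // 10 inputs in chunks of red_factor = 4: rng = 2 full chunks,
  // remaining = 2 leftover elements, so scale = 2*4 + 2 = 10.
  const std::vector<double> in = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const std::size_t red_factor = 4, rng = 2, remaining = 2;
  const double scale = static_cast<double>(rng * red_factor + remaining);

  // Each work-item sums its own chunk and divides by the total element count.
  std::vector<double> partial(rng);
  for (std::size_t g = 0; g < rng; ++g) {
    double s = 0;
    for (std::size_t i = 0; i < red_factor; ++i) s += in[g * red_factor + i];
    partial[g] = s / scale;
  }

  // The globalid == 0 branch folds the leftover tail into partial 0,
  // undoing and re-applying the 1/scale factor, as in the kernel above.
  double tail = 0;
  for (std::size_t i = rng * red_factor; i < in.size(); ++i) tail += in[i];
  partial[0] = (partial[0] * scale + tail) / scale;

  double mean = 0;
  for (double p : partial) mean += p;
  assert(std::fabs(mean - 5.5) < 1e-12);  // (1 + ... + 10) / 10
  return 0;
}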
-
-}
-}
-}
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
deleted file mode 100644
index 234580c7c..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ /dev/null
@@ -1,213 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclLeafCount.h
- *
- * \brief:
- * LeafCount uses a pre-order traversal of the expression tree in order to
- * count the number of leaf nodes in the expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \brief LeafCount is used to count the terminal nodes. The total number of
-/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
-/// of each leaf node in an expression tree at compile time.
-template <typename Expr>
-struct LeafCount;
-
-template<typename... Args> struct CategoryCount;
-
-template<> struct CategoryCount<>
-{
- static const size_t Count =0;
-};
-
-template<typename Arg, typename... Args>
-struct CategoryCount<Arg,Args...>{
- static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
-};
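A compile-time sketch of this counting scheme, using toy Leaf/Unary/Binary node types (assumptions for illustration, not Eigen expression types) to show how the recursion bottoms out at the leaves:

#include <cstddef>

// Toy expression nodes.
struct Leaf {};
template <typename RHS> struct Unary {};
template <typename LHS, typename RHS> struct Binary {};

// Leaf-count trait mirroring LeafCount/CategoryCount: a leaf counts as 1,
// an inner node is the sum of its children's counts.
template <typename Expr> struct ToyLeafCount;
template <> struct ToyLeafCount<Leaf> { static const std::size_t Count = 1; };
template <typename R> struct ToyLeafCount<Unary<R> > { static const std::size_t Count = ToyLeafCount<R>::Count; };
template <typename L, typename R> struct ToyLeafCount<Binary<L, R> > {
  static const std::size_t Count = ToyLeafCount<L>::Count + ToyLeafCount<R>::Count;
};

// e.g. assign(C, f(A + B)) with every operand a TensorMap-like leaf: three leaves in total.
static_assert(ToyLeafCount<Binary<Leaf, Unary<Binary<Leaf, Leaf> > > >::Count == 3, "three leaves expected");

int main() { return 0; }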
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
-struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
- static const size_t Count =1;\
-};
-
-SYCLTENSORMAPLEAFCOUNT(const)
-SYCLTENSORMAPLEAFCOUNT()
-#undef SYCLTENSORMAPLEAFCOUNT
-
-// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
-#define SYCLCATEGORYLEAFCOUNT(CVQual)\
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
-struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-
-SYCLCATEGORYLEAFCOUNT(const)
-SYCLCATEGORYLEAFCOUNT()
-#undef SYCLCATEGORYLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp, which is an exception since it has no OP
-#define SYCLSELECTOPLEAFCOUNT(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-
-SYCLSELECTOPLEAFCOUNT(const)
-SYCLSELECTOPLEAFCOUNT()
-#undef SYCLSELECTOPLEAFCOUNT
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
-#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr>\
-struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-
-SYCLLEAFCOUNTASSIGNOP(const)
-SYCLLEAFCOUNTASSIGNOP()
-#undef SYCLLEAFCOUNTASSIGNOP
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
- static const size_t Count =1;\
-};
-
-SYCLFORCEDEVALLEAFCOUNT(const)
-SYCLFORCEDEVALLEAFCOUNT()
-#undef SYCLFORCEDEVALLEAFCOUNT
-
-#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
-template <typename CustomUnaryFunc, typename XprType>\
-struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
-static const size_t Count =1;\
-};
-
-SYCLCUSTOMUNARYOPLEAFCOUNT(const)
-SYCLCUSTOMUNARYOPLEAFCOUNT()
-#undef SYCLCUSTOMUNARYOPLEAFCOUNT
-
-
-#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
-template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
-static const size_t Count =1;\
-};
-SYCLCUSTOMBINARYOPLEAFCOUNT( const)
-SYCLCUSTOMBINARYOPLEAFCOUNT()
-#undef SYCLCUSTOMBINARYOPLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp, TensorLayoutSwapOp, or TensorIndexTupleOp
-#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
-template <typename Expr>\
-struct LeafCount<CVQual ExprNode<Expr> > {\
- static const size_t Count = Num + CategoryCount<Expr>::Count;\
-};
-
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
-
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
-
-#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp or TensorTupleReducerOp
-#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
- static const size_t Count =1;\
-};
-
-// TensorReductionOp
-REDUCTIONLEAFCOUNT(const,TensorReductionOp)
-REDUCTIONLEAFCOUNT(,TensorReductionOp)
-
-// tensor Argmax -TensorTupleReducerOp
-REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
-REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
-
-#undef REDUCTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorContractionOp or TensorConvolutionOp
-#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
- static const size_t Count =1;\
-};
-
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
-#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
-#define SLICEOPLEAFCOUNT(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType>\
-struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
-
-SLICEOPLEAFCOUNT(const)
-SLICEOPLEAFCOUNT()
-#undef SLICEOPLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
-#define CHIPPINGOPLEAFCOUNT(CVQual)\
-template <DenseIndex DimId, typename XprType>\
-struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
-
-CHIPPINGOPLEAFCOUNT(const)
-CHIPPINGOPLEAFCOUNT()
-#undef CHIPPINGOPLEAFCOUNT
-
-///TensorStridingSlicingOp
-#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
-
-SLICESTRIDEOPLEAFCOUNT(const)
-SLICESTRIDEOPLEAFCOUNT()
-#undef SLICESTRIDEOPLEAFCOUNT
-
-//TensorImagePatchOp
-#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{};
-
-
-TENSORIMAGEPATCHOPLEAFCOUNT(const)
-TENSORIMAGEPATCHOPLEAFCOUNT()
-#undef TENSORIMAGEPATCHOPLEAFCOUNT
-
-// TensorVolumePatchOp
-#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
-
-TENSORVOLUMEPATCHOPLEAFCOUNT(const)
-TENSORVOLUMEPATCHOPLEAFCOUNT()
-#undef TENSORVOLUMEPATCHOPLEAFCOUNT
-
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
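
For reference, here is a minimal, standalone sketch (plain C++11, no Eigen or SYCL headers) of the leaf-counting pattern that the deleted file above implemented. The Leaf, Plus and Select node types are hypothetical stand-ins for the tensor expression classes; only the counting mechanism mirrors LeafCount/CategoryCount.

// Compile-time leaf counting over a toy expression-template tree.
// All node types here are illustrative, not Eigen classes.
#include <cstddef>

template <typename T> struct Leaf {};
template <typename L, typename R> struct Plus {};
template <typename C, typename T, typename E> struct Select {};

// Primary template, specialised per node type (mirrors LeafCount).
template <typename Expr> struct LeafCount;

// Summation over a parameter pack (mirrors CategoryCount).
template <typename... Args> struct CategoryCount;
template <> struct CategoryCount<> { static const std::size_t Count = 0; };
template <typename Arg, typename... Args>
struct CategoryCount<Arg, Args...> {
  static const std::size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
};

// A terminal node contributes one leaf; interior nodes sum their children.
template <typename T> struct LeafCount<Leaf<T> > { static const std::size_t Count = 1; };
template <typename L, typename R> struct LeafCount<Plus<L, R> > : CategoryCount<L, R> {};
template <typename C, typename T, typename E>
struct LeafCount<Select<C, T, E> > : CategoryCount<C, T, E> {};

using Expr = Select<Leaf<bool>, Plus<Leaf<float>, Leaf<float> >, Leaf<float> >;
static_assert(LeafCount<Expr>::Count == 4, "four terminal nodes in the tree");

int main() { return 0; }

The same idea scales to any node arity: interior nodes forward to CategoryCount over their children, while terminal nodes contribute a count of one.
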
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
deleted file mode 100644
index 9d5708fc5..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ /dev/null
@@ -1,302 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclPlaceHolderExpr.h
- *
- * \brief:
- * This is the specialisation of the placeholder expression based on the
- * operation type
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct PlaceHolder
-/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
-/// tree.
-/// PlaceHolder contains the order of the leaf node in the expression tree.
-template <typename Scalar, size_t N>
-struct PlaceHolder {
- static constexpr size_t I = N;
- typedef Scalar Type;
-};
-
-/// \struct PlaceHolderExpression
-/// \brief It is used to create the PlaceHolder expression. The PlaceHolder
-/// expression is a copy of the expression type in which the TensorMap nodes
-/// have been replaced with PlaceHolder.
-template <typename Expr, size_t N>
-struct PlaceHolderExpression;
-
-template<size_t N, typename... Args>
-struct CalculateIndex;
-
-template<size_t N, typename Arg>
-struct CalculateIndex<N, Arg>{
- typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
- typedef utility::tuple::Tuple<ArgType> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2>
-struct CalculateIndex<N, Arg1, Arg2>{
- static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
- typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
- typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
- typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2, typename Arg3>
-struct CalculateIndex<N, Arg1, Arg2, Arg3> {
- static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
- static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
- typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
- typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
- typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
- typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
-};
-
-template<template<class...> class Category , class OP, class TPL>
-struct CategoryHelper;
-
-template<template<class...> class Category , class OP, class ...T >
-struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
- typedef Category<OP, T... > Type;
-};
-
-template<template<class...> class Category , class ...T >
-struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
- typedef Category<T... > Type;
-};
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp
-#define OPEXPRCATEGORY(CVQual)\
-template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
-struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
- typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
-};
-
-OPEXPRCATEGORY(const)
-OPEXPRCATEGORY()
-#undef OPEXPRCATEGORY
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorSelectOp
-#define SELECTEXPR(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
- typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
-};
-
-SELECTEXPR(const)
-SELECTEXPR()
-#undef SELECTEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorAssignOp
-#define ASSIGNEXPR(CVQual)\
-template <typename LHSExpr, typename RHSExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
- typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
-};
-
-ASSIGNEXPR(const)
-ASSIGNEXPR()
-#undef ASSIGNEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorMap
-#define TENSORMAPEXPR(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\
-};
-
-TENSORMAPEXPR(const)
-TENSORMAPEXPR()
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCustomUnaryOp
-#define CUSTOMUNARYOPEVAL(CVQual)\
-template <typename CustomUnaryFunc, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
-};
-
-CUSTOMUNARYOPEVAL(const)
-CUSTOMUNARYOPEVAL()
-#undef CUSTOMUNARYOPEVAL
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCustomBinaryOp
-#define CUSTOMBINARYOPEVAL(CVQual)\
-template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
-};
-
-CUSTOMBINARYOPEVAL(const)
-CUSTOMBINARYOPEVAL()
-#undef CUSTOMBINARYOPEVAL
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
-#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
- typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-// TensorEvalToOp
-EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
-EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
-//TensorLayoutSwapOp
-EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
-EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
-//TensorIndexTupleOp
-EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
-EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
-
-#undef EVALTOLAYOUTSWAPINDEXTUPLE
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorChippingOp
-#define CHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
- typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-CHIPPINGOP(const)
-CHIPPINGOP()
-#undef CHIPPINGOP
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp and TensorTupleReducerOp (Argmax)
-#define SYCLREDUCTION(CVQual, ExprNode)\
-template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
- typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
-};
-
-// tensor reduction
-SYCLREDUCTION(const, TensorReductionOp)
-SYCLREDUCTION(, TensorReductionOp)
-
-// tensor Argmax -TensorTupleReducerOp
-SYCLREDUCTION(const, TensorTupleReducerOp)
-SYCLREDUCTION(, TensorTupleReducerOp)
-#undef SYCLREDUCTION
-
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
- typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
-};
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONPLH
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorSlicingOp
-#define SLICEOPEXPR(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SLICEOPEXPR(const)
-SLICEOPEXPR()
-#undef SLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPPLH(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SYCLSLICESTRIDEOPPLH(const)
-SYCLSLICESTRIDEOPPLH()
-#undef SYCLSLICESTRIDEOPPLH
-
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorImagePatchOp
-#define SYCLTENSORIMAGEPATCHOP(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\
- typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
-};
-
-SYCLTENSORIMAGEPATCHOP(const)
-SYCLTENSORIMAGEPATCHOP()
-#undef SYCLTENSORIMAGEPATCHOP
-
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorVolumePatchOp
-#define SYCLTENSORVOLUMEPATCHOP(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\
- typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
-};
-
-SYCLTENSORVOLUMEPATCHOP(const)
-SYCLTENSORVOLUMEPATCHOP()
-#undef SYCLTENSORVOLUMEPATCHOP
-
-
-/// template deduction for \ref PlaceHolderExpression struct
-template <typename Expr>
-struct createPlaceHolderExpression {
- static const size_t TotalLeaves = LeafCount<Expr>::Count;
- typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
-};
-
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
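
For reference, a minimal sketch of the placeholder substitution performed by the deleted file above, again using hypothetical Leaf/Plus nodes (redeclared so the snippet is self-contained) rather than the real tensor ops. It numbers the leaves from the right in the same way CalculateIndex shifts the index by the leaf count of the trailing arguments.

// Replacing leaves with PlaceHolder<..., N> at compile time (plain C++11).
#include <cstddef>
#include <type_traits>

template <typename T> struct Leaf {};
template <typename L, typename R> struct Plus {};

template <typename Expr> struct LeafCount;
template <typename T> struct LeafCount<Leaf<T> > { static const std::size_t Count = 1; };
template <typename L, typename R> struct LeafCount<Plus<L, R> > {
  static const std::size_t Count = LeafCount<L>::Count + LeafCount<R>::Count;
};

// PlaceHolder keeps the leaf's position in the tree as a compile-time index.
template <typename Scalar, std::size_t N> struct PlaceHolder {
  static constexpr std::size_t I = N;
};

template <typename Expr, std::size_t N> struct PlaceHolderExpression;
template <typename T, std::size_t N>
struct PlaceHolderExpression<Leaf<T>, N> { typedef PlaceHolder<Leaf<T>, N> Type; };
template <typename L, typename R, std::size_t N>
struct PlaceHolderExpression<Plus<L, R>, N> {
  // The right subtree keeps N; the left subtree is shifted by the number of
  // leaves on the right, exactly like CalculateIndex<N, Arg1, Arg2>.
  typedef Plus<typename PlaceHolderExpression<L, N - LeafCount<R>::Count>::Type,
               typename PlaceHolderExpression<R, N>::Type> Type;
};

template <typename Expr> struct createPlaceHolderExpression {
  static const std::size_t TotalLeaves = LeafCount<Expr>::Count;
  typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
};

using Expr = Plus<Plus<Leaf<float>, Leaf<float> >, Leaf<float> >;
using PH = createPlaceHolderExpression<Expr>::Type;
static_assert(std::is_same<PH, Plus<Plus<PlaceHolder<Leaf<float>, 0>,
                                         PlaceHolder<Leaf<float>, 1> >,
                                    PlaceHolder<Leaf<float>, 2> > >::value,
              "leaves are numbered 0, 1, 2 from left to right");

int main() { return 0; }
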
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
deleted file mode 100644
index 29c78184d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ /dev/null
@@ -1,96 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Cummins Chris PhD student at The University of Edinburgh.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclRun.h
- *
- * \brief:
- * Schedule_kernel invokes a specialised version of the kernel struct. The
- * specialisation is based on the data dimension of the SYCL buffer
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
- typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-
- typedef typename Expr::Index Index;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
- Index range;
- ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
- auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
- typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
- if (gId < range)
- device_evaluator.evalScalar(gId);
- }
-};
-
-/// The run function in TensorSycl converts the expression tree to a
-/// buffer-based expression tree,
-/// creates the expression tree for the device with accessors to the buffers,
-/// then constructs the kernel and submits it to the SYCL queue.
-/// std::array does not have TotalSize(), so the size is obtained through template specialisation.
-template<typename , typename Dimensions> struct DimensionSize{
- static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
- return dim.TotalSize();
- }
-};
-#define DIMSIZEMACRO(CVQual)\
-template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
- static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
- return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
- }\
-};
-
-DIMSIZEMACRO(const)
-DIMSIZEMACRO()
-#undef DIMSIZEMACRO
-
-
-template <typename Expr, typename Dev>
-void run(Expr &expr, Dev &dev) {
- Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign) {
- typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
- FunctorExpr functors = internal::extractFunctors(evaluator);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
- typename Expr::Index range, GRange, tileSize;
- typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
- dev.parallel_for_setup(total_size, tileSize, range, GRange);
-
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
- , functors, tuple_of_accessors
- ));
- });
- dev.asynchronousExec();
- }
- evaluator.cleanup();
-}
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
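
For reference, a minimal sketch of just the DimensionSize dispatch used by run() above, without any SYCL queue or kernel submission. FakeDims is a hypothetical stand-in for an Eigen dimensions object exposing TotalSize(); the std::array specialisation multiplies the extents, treating a rank-0 array as a single element.

// Selecting how to compute the total element count from the dimensions type.
#include <array>
#include <cstddef>
#include <iostream>

struct FakeDims {
  std::size_t sizes[3];
  std::size_t TotalSize() const { return sizes[0] * sizes[1] * sizes[2]; }
};

// Primary template: assume the dimensions type has TotalSize().
template <typename Index, typename Dimensions> struct DimensionSize {
  static auto getDimSize(const Dimensions& dim) -> decltype(dim.TotalSize()) {
    return dim.TotalSize();
  }
};

// Specialisation for std::array: multiply the extents; rank 0 yields 1.
template <typename Index, std::size_t NumDims>
struct DimensionSize<Index, std::array<Index, NumDims> > {
  static Index getDimSize(const std::array<Index, NumDims>& dim) {
    Index total = 1;
    for (std::size_t i = 0; i < NumDims; ++i) total *= dim[i];
    return total;
  }
};

int main() {
  FakeDims d{{2, 3, 4}};
  std::array<std::size_t, 2> a{{5, 6}};
  std::cout << DimensionSize<std::size_t, FakeDims>::getDimSize(d) << "\n";                     // 24
  std::cout << DimensionSize<std::size_t, std::array<std::size_t, 2> >::getDimSize(a) << "\n";  // 30
  return 0;
}

The total size obtained this way is what run() feeds into parallel_for_setup to derive the tile size and global range for the kernel launch.
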
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
deleted file mode 100644
index 5385c7eac..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ /dev/null
@@ -1,239 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclTuple.h
- *
- * \brief:
- * Minimal implementation of std::tuple that can be used inside a SYCL kernel.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-
-namespace utility {
-namespace tuple {
-/// \struct StaticIf
-/// \brief The StaticIf struct is used to statically choose the type based on the
-/// condition.
-template <bool, typename T = void> struct StaticIf;
-/// \brief specialisation of the \ref StaticIf when the condition is true
-template <typename T>
-struct StaticIf<true, T> {
- typedef T type;
-};
-
-/// \struct Tuple
-/// \brief Tuple is a fixed-size collection of heterogeneous values
-/// \tparam Ts... - the types of the elements that the tuple stores.
-/// Empty list is supported.
-template <class... Ts>
-struct Tuple {};
-
-/// \brief specialisation of the \ref Tuple class when the tuple has at least
-/// one element.
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-template <class T, class... Ts>
-struct Tuple<T, Ts...> {
- Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
- T head;
- Tuple<Ts...> tail;
-};
-
-/// \struct ElemTypeHolder
-/// \brief ElemTypeHolder class is used to specify the types of the
-/// elements inside the tuple
-/// \tparam size_t the index of the element inside the tuple
-/// \tparam class the tuple class
-template <size_t, class>
-struct ElemTypeHolder;
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the requested
-/// index is 0; the type is that of the first element in the tuple
-template <class T, class... Ts>
-struct ElemTypeHolder<0, Tuple<T, Ts...> > {
- typedef T type;
-};
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the requested
-/// index k is bigger than 0. It recursively calls itself to
-/// detect the type of the k-th element in the tuple
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-/// \tparam k is the index of the requested element in the tuple
-template <size_t k, class T, class... Ts>
-struct ElemTypeHolder<k, Tuple<T, Ts...> > {
- typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
-};
-
-/// get
-/// \brief Extracts the first element from the tuple.
-/// K=0 represents the first element of the tuple. The tuple cannot be empty.
-/// \tparam Ts... are the type of the elements in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return a reference to the first element of the tuple
-
-#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
-template <size_t k, class... Ts> \
-typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
-get(CVQual Tuple<Ts...> &t) { \
- static_assert(sizeof...(Ts)!=0, "The requested index is bigger than the size of the tuple"); \
- return t.head; \
-}
-
-TERMINATE_CONDS_TUPLE_GET(const)
-TERMINATE_CONDS_TUPLE_GET()
-#undef TERMINATE_CONDS_TUPLE_GET
-/// get
-/// \brief Extracts the Kth element from the tuple.
-/// \tparam K is an integer value in [0, sizeof...(Types)).
-/// \tparam T is the type of the first element in the tuple.
-/// \tparam Ts... are the type of the elements in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return a reference to the Kth element of the tuple
-#define RECURSIVE_TUPLE_GET(CVQual) \
-template <size_t k, class T, class... Ts> \
-typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
-get(CVQual Tuple<T, Ts...> &t) { \
- return utility::tuple::get<k - 1>(t.tail); \
-}
-RECURSIVE_TUPLE_GET(const)
-RECURSIVE_TUPLE_GET()
-#undef RECURSIVE_TUPLE_GET
-
-/// make_tuple
-/// \brief Creates a tuple object, deducing the target type from the types of
-/// arguments.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \param args zero or more arguments to construct the tuple from
-/// \return Tuple<Args...>
-template <typename... Args>
-Tuple<Args...> make_tuple(Args... args) {
- return Tuple<Args...>(args...);
-}
-
-/// size
-/// \brief Provides access to the number of elements in a tuple as a
-/// compile-time constant expression.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \return size_t
-template <typename... Args>
-static constexpr size_t size(Tuple<Args...> &) {
- return sizeof...(Args);
-}
-
-/// \struct IndexList
-/// \brief Creates a list of indices from the elements in the tuple
-/// \tparam Is... a list of indices in [0, sizeof...(tuple elements))
-template <size_t... Is>
-struct IndexList {};
-
-/// \struct RangeBuilder
-/// \brief Collects internal details for generating index ranges [MIN, MAX)
-/// Declare primary template for index range builder
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof...(elements) - sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder;
-
-// FIXME Doxygen has problems with recursive inheritance
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-/// \brief base step: specialisation of the \ref RangeBuilder when
-/// N==MIN. In this case Is... holds the generated indices [MIN, MAX)
-/// \tparam MIN is the starting index of the tuple
-/// \tparam Is is [0 to sizeof...(tuple elements))
-template <size_t MIN, size_t... Is>
-struct RangeBuilder<MIN, MIN, Is...> {
- typedef IndexList<Is...> type;
-};
-
-/// Induction step: specialisation of the RangeBuilder class when N!=MIN.
-/// In this case N is recursively decremented by one and one index is added
-/// to the Is... list until N==MIN
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof...(elements) - sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
-#endif // EIGEN_PARSED_BY_DOXYGEN
-
-/// \brief IndexRange that returns a [MIN, MAX) index range
-/// \tparam MIN is the starting index in the tuple
-/// \tparam MAX is the size of the tuple
-template <size_t MIN, size_t MAX>
-struct IndexRange: RangeBuilder<MIN, MAX>::type {};
-
-/// append_base
-/// \brief unpacking the elements of the input tuple t and creating a new tuple
-/// by adding element a at the end of it.
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \tparam I... is the list of index from [0 to sizeof...(t))
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T, size_t... I>
-Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
- return utility::tuple::make_tuple(get<I>(t)..., a);
-}
-
-/// append
-/// \brief the deduction function for \ref append_base that automatically
-/// generate the \ref IndexRange
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T>
-Tuple<Args..., T> append(Tuple<Args...> t, T a) {
- return utility::tuple::append_base(t, a, IndexRange<0, sizeof...(Args)>());
-}
-
-/// append_base
-/// \brief This is an overload of \ref append_base used to concatenate
-/// tuple t2 at the end of tuple t1. Here we unpack both tuples, generate the
-/// IndexRange for each of them and create an output tuple that contains the
-/// elements of both t1 and t2.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \tparam I1... is the list of index from [0 to sizeof...(t1))
-/// \tparam I2... is the list of index from [0 to sizeof...(t2))
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
-Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
- return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
-}
-
-/// append
-/// \brief deduction function for \ref append_base when we are appending tuple
-/// t2 to tuple t1. In this case the \ref IndexRange for both tuples is
-/// automatically generated.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2>
-Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
- return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
-}
-} // namespace tuple
-} // namespace utility
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
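
For reference, a minimal, self-contained sketch of the head/tail tuple removed above, trimmed to get/make_tuple/append so it compiles and runs on its own. It lives in a hypothetical namespace mini rather than utility::tuple, and it uses C++17 if constexpr and std::index_sequence in place of the StaticIf and RangeBuilder helpers.

// A toy head/tail tuple with indexed access and append (C++17).
#include <cstddef>
#include <iostream>
#include <utility>   // std::index_sequence, std::make_index_sequence

namespace mini {

template <class... Ts> struct Tuple {};
template <class T, class... Ts> struct Tuple<T, Ts...> {
  Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
  T head;
  Tuple<Ts...> tail;
};

// get<k>: peel off the tail k times, like the recursive get above, but with
// if constexpr instead of the StaticIf return-type selection.
template <std::size_t k, class T, class... Ts>
auto& get(Tuple<T, Ts...>& t) {
  if constexpr (k == 0) return t.head;
  else return get<k - 1>(t.tail);
}

template <class... Args>
Tuple<Args...> make_tuple(Args... args) { return Tuple<Args...>(args...); }

// append: unpack t via an index sequence (std::make_index_sequence plays the
// role of IndexRange/RangeBuilder) and rebuild the tuple with a at the end.
template <class... Args, class T, std::size_t... I>
Tuple<Args..., T> append_base(Tuple<Args...>& t, T a, std::index_sequence<I...>) {
  return make_tuple(get<I>(t)..., a);
}
template <class... Args, class T>
Tuple<Args..., T> append(Tuple<Args...> t, T a) {
  return append_base(t, a, std::make_index_sequence<sizeof...(Args)>());
}

}  // namespace mini

int main() {
  auto t = mini::make_tuple(1, 2.5f);
  auto u = mini::append(t, 'x');   // Tuple<int, float, char>
  std::cout << mini::get<0>(u) << " " << mini::get<1>(u) << " " << mini::get<2>(u) << "\n";
  return 0;
}

The deleted header avoided std::index_sequence because it targeted pre-C++14 device code, which is why it carried its own IndexList/RangeBuilder machinery.
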