| author | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
|---|---|---|
| committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
| commit | 00f32752f7d0b193c6788691c3cf0b76457a044d (patch) | |
| tree | 792e46110f0751ea8802fa9d403d1472d5977ac3 /unsupported | |
| parent | ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff) | |
[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.
* Unifying all loadLocalTile functions for LHS and RHS into a single extract_block function.
* Adding the get_tensor operation, which was missing from TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation (see the build-flag sketch after this list).
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying the load to private registers for the tall/skinny no-shared-memory case.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing the output function from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake files.
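For context, a minimal sketch (not part of the patch) of the compile-time knobs wired up above and the panel arithmetic they feed. The EIGEN_SYCL_REG_M/N, EIGEN_SYCL_LOCAL_THREAD_DIM0/1 and EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macros all appear in the diff below; the exact -D spelling of the skinny-contraction switch is an assumption.

// Hypothetical compile line (flag spellings assumed, compiler-dependent):
//   compute++ -DEIGEN_USE_SYCL=1 -DEIGEN_SYCL_REG_M=4 -DEIGEN_SYCL_REG_N=4 \
//             -DEIGEN_SYCL_LOCAL_THREAD_DIM0=16 -DEIGEN_SYCL_LOCAL_THREAD_DIM1=16
// With those values, the TTPanelSize arithmetic defined in the diff gives:
constexpr int LocalThreadSizeM   = 16;  // EIGEN_SYCL_LOCAL_THREAD_DIM0
constexpr int LocalThreadSizeN   = 16;  // EIGEN_SYCL_LOCAL_THREAD_DIM1
constexpr int WorkLoadPerThreadM = 4;   // EIGEN_SYCL_REG_M
constexpr int WorkLoadPerThreadN = 4;   // EIGEN_SYCL_REG_N
constexpr int TileSizeDimK       = 16;  // TSDK template argument
constexpr int TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;  // 64-row tile per work-group
constexpr int TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;  // 64-column tile per work-group
constexpr int LoadPerThreadLhs =
    (TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / TileSizeDimN;  // 4 LHS loads per thread
constexpr int LoadPerThreadRhs =
    (TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / TileSizeDimM;  // 4 RHS loads per thread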
Diffstat (limited to 'unsupported')
43 files changed, 6365 insertions, 4400 deletions
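Before the patch body, a usage sketch of what the rebased backend looks like from application code; Eigen::QueueInterface, Eigen::SyclDevice and the device-side assignment via .device(...) are existing Eigen SYCL entry points, while the sizes and the device selector here are illustrative only.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Bind a SYCL queue to an Eigen device.
  Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
  Eigen::SyclDevice device(&queue_interface);

  Eigen::Tensor<float, 2> lhs(64, 64), rhs(64, 64), out(64, 64);
  lhs.setRandom();
  rhs.setRandom();

  // Device-side buffers managed through the SyclDevice allocator.
  float *d_lhs = static_cast<float *>(device.allocate(lhs.size() * sizeof(float)));
  float *d_rhs = static_cast<float *>(device.allocate(rhs.size() * sizeof(float)));
  float *d_out = static_cast<float *>(device.allocate(out.size() * sizeof(float)));
  device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float));
  device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 2>> a(d_lhs, 64, 64), b(d_rhs, 64, 64), c(d_out, 64, 64);
  // Contract the inner dimension of a with the first dimension of b; this is
  // the path served by the TensorContractionSycl.h kernels in this patch.
  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> dims = {Eigen::IndexPair<Eigen::Index>(1, 0)};
  c.device(device) = a.contract(b, dims);
  device.synchronize();

  device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
  device.deallocate(d_lhs);
  device.deallocate(d_rhs);
  device.deallocate(d_out);
  return 0;
}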
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 6a8dc2cd8..f8a62253c 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -15,19 +15,6 @@ #if EIGEN_HAS_CXX11 -#if defined(EIGEN_USE_SYCL) -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include <CL/sycl.hpp> -#include <iostream> -#include <map> -#include <memory> -#include <utility> -#endif - #include "../SpecialFunctions" #include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" @@ -72,7 +59,7 @@ typedef unsigned __int64 uint64_t; #include <time.h> #endif -#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif @@ -147,7 +134,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorScan.h" #include "src/Tensor/TensorTrace.h" -#include "src/Tensor/TensorSycl.h" +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif + #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h deleted file mode 100644 index 2184c94b3..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ /dev/null @@ -1,152 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -/***************************************************************** - * TensorArgMaxSycl.h - * \brief: - * TensorArgMaxSycl - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -namespace Eigen { -namespace internal { - template<typename Dims, typename XprType> - struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense> - { - typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type; - }; - - template<typename Dims, typename XprType> - struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1, - typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type> - { - typedef TensorTupleReducerDeviceOp<Dims, XprType> type; - }; - -template<typename StrideDims, typename XprType> -struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType> -{ - typedef traits<XprType> XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference<Nested>::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - - -}// end namespace internal -template<typename StrideDims, typename XprType> -class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar; - typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; - typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested; - typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind; - typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index; - typedef typename XprType::CoeffReturnType TupleType; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, - const Index return_dim, - const StrideDims strides, - const Index stride_mod, const Index stride_div) - :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - Index return_dim() const { return m_return_dim; } - - EIGEN_DEVICE_FUNC - const StrideDims& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const Index& stride_mod() const { return m_stride_mod; } - - EIGEN_DEVICE_FUNC - const Index& stride_div() const { return m_stride_div; } - - protected: - typename Eigen::internal::remove_all<typename - XprType::Nested - >::type m_xpr; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; - - -// Eval as rvalue -template<typename StrideDims, typename ArgType> -struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice> -{ - typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::TupleType TupleType; - typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions; - - enum { - IsAligned = 
false, - PacketAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) - : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), - m_stride_div(op.stride_div()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; - } -typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); } - -protected: - TensorEvaluator<ArgType , SyclKernelDevice> m_impl; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; -} // end namespace Eigen -#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 50865d404..9ab900b4a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -545,6 +545,10 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const { + return Base::m_tensor; + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 35f931c53..a6ca1777a 100644..100755 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1,136 +1,1386 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. +// This file is part of Eigen, a lightweight C++ template library for linear algebra. // // Mehdi Goli Codeplay Software Ltd. // Ralph Potter Codeplay Software Ltd. // Luke Iwanski Codeplay Software Ltd. // Contact: <eigen@codeplay.com> // -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
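// (Editorial note, not part of the patch: the get_tensor() accessor added to
// TensorContractionInputMapper above is what lets the SYCL read<>() helper
// defined further down bypass the mapper and issue vectorized loads directly,
// along the lines of
//   tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
// this is the coalesced, packetized path, while the scalar fallback goes
// through tensorMapper(row, col) instead.)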
/***************************************************************** - * TensorTensorContractionsycl.h + * TensorContractionSycl.h * * \brief: - * TensorContractionsycl + * TensorContractionSycl.h, provides various tensor contraction kernels for the SYCL backend * -*****************************************************************/ + *****************************************************************/ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + namespace Eigen { -template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; -template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> > { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching the General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex: determines the Index type. + * + * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group + * + * \tparam CFactor: determines the number of contracting elements to be processed by each thread + * + * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread + */ +template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor> +struct TVPanelSize { + // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC: determines the workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines the workload per thread for loading the contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC: determines if padding to avoid bank conflicts is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching the General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type.
+ * + * \tparam REG_SIZE_M: determines the workload per thread for loading the M dimension. This can be varied based on the + available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines the workload per thread for loading the N dimension. This can be varied based on the + available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines the Tile size for dimension K. The packet size is assumed to be taken into account + */ + +template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK> +struct TTPanelSize { + // TileSizeDimK: determines the Tile size for dimension K. The packet size is assumed to be taken into account + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM: determines the workload per thread for loading the M dimension. This can be varied based on the + // available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro). +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN: determines the workload per thread for loading the N dimension. This can be varied based on the +// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro). +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines the workload per thread for loading the Lhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines the workload per thread for loading the Rhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC: determines if padding to avoid bank conflicts is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if the double-buffering technique should be used (this can be disabled by the + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/*! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/*!
+ * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee coalesced and vectorized loads whenever possible. + * + * \tparam PacketLoad: determines if each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed in a coalesced and + vectorized way when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case + when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper, + typename StorageIndex> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld)); +} + +/*! + * \brief read, special overload of the read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed in a coalesced and + vectorized way when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case + when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory.
This function is used to guarantee + * coalesced and vectorized stores whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile-time value for the local memory + * + * \tparam data_source: an enum value representing the location of the data in the memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex: is the contracting dim index + */ + +template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is enabled. This function + * is used to guarantee coalesced and vectorized stores whenever possible. + * + * \tparam data_source: an enum value representing the location of the data in the memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written to the global memory + * + * \param ptr: a pointer to the global memory + */ + +template <data_source dt, typename PacketType, typename DataScalar> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing the location of the data in the memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written to the global memory + * + * \param ptr: a pointer to the global memory + */ +template <data_source dt, typename PacketType, typename DataScalar> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition of a block; for internal blocks no check is needed. + * + * \tparam is_internal: determines if the block is internal + */ +template <bool is_internal> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) { + return cond; +} + +/*!
+ * \brief BlockProperties is a template class that provides the different characteristics of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * PacketType; otherwise it will be the scalar type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in memory can be accessed in a coalesced and + * vectorized way when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized way. This is the case + * when the RHS (right-hand side) Tensor is transposed or when the LHS (left-hand side) Tensor is not transposed. + * + * \param nc_stride determines the stride of the non-contracting dimension to access the next adjacent element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of the contracting dimension to access the next adjacent element within the + * Tensor Block for each workgroup + */ +template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType> +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the definition of workgroups and + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in the k dimension of the flattened tensor. It will be > 1 when + * the tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all threads within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all threads within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when the tall/skinny algorithm is used. + * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent of their global position.
+ * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent of their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread for the m dimension of the + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread for the n dimension of the + * flattened tensor + * + * \param kSize: determines the number of k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This differs from the K dimension of the flattened Tensor when the tall/skinny algorithm is used. + * + * \param is_internal: determines if the thread within the work-group computes an internal block of the tensor or + * an edge block. When it is internal, there is no need to check the boundaries and all the if statements can be + * resolved by the compiler. + */ +template <typename StorageIndex> +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides the Tensor-Tensor contraction operation. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for the left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for the right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties: determines if the input tensors are matrices.
If they are matrices, a special memory + access pattern is used to guarantee that the memory accesses are always coalesced. + * + * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of the contraction will be written in a temporary buffer. This is the case when the tall/skinny + contraction is used, and a final reduction step is then required to compute the final output. + + * \tparam contraction_tp: an enum value representing whether the local-memory or no-local-memory implementation of + the algorithm is to be used + * + * \param scratch: local memory containing tiles of the LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-groups for the m dimension + * + * \param groupSizeN: a logical number determining the number of work-groups for the n dimension + * + * \param numTiles: determines the total number of tiles in the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix + */ +template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper, + typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable, + typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp> +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous; + + typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable, + PacketReturnType> + LHSBlockProperties; + + typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable, + PacketReturnType> + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch; + typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr; + typedef + typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ?
Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder is a placeholder struct for creating the memory hierarchy in SYCL. Inside a SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder struct is used as a helper to abstract out the + * different types of memory needed when the local/no_local memory computation is called. + * + * \tparam contraction_type: an enum value representing whether the local-memory or no-local-memory implementation + of the algorithm is to be used + * \tparam MemSize: the private memory size + * \param ptr: the tile memory pointer + */ + template <contraction_type, StorageIndex> + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of the MemHolder class when the no-local-memory kernel is used. + */ + template <StorageIndex MemSize> + struct MemHolder<contraction_type::no_local, MemSize> { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains the required memory pointers for loading each tile of the TensorContraction panel from + * global memory to local/private memory when the local/no_local algorithm is used. + * + * \param lhs_scratch_extract: determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract: determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determines the position of each thread in local memory for the lhs input. When private + * memory is used this is set to zero, as it is not applicable in the private-memory case. + * + * \param rhs_extract_index: determines the position of each thread in local memory for the rhs input. When private + * memory is used this is set to zero, as it is not applicable in the private-memory case. + * + * \param lhs_scratch_compute: determines the location to load from for computation in lhs local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute: determines the location to load from for computation in rhs local memory. This is the + * same as rhs_scratch_extract for private memory.
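 *
 * (Editorial sketch, not part of the patch: for the local-memory kernel the
 * scratch accessor is carved up so that the RHS tiles start at
 *   lhs_scratch_extract.ptr + (DoubleBuffer + 1) * LSDL * TileSizeDimK,
 * i.e. with double buffering enabled each operand owns two consecutive tile
 * slots and sync_mem simply flips db_offset between them instead of issuing a
 * second barrier.)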
+ */ + struct TiledMemory { + MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract; + MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair<StorageIndex, StorageIndex> lhs_extract_index; + const std::pair<StorageIndex, StorageIndex> rhs_extract_index; + template <contraction_type tp = contraction_tp> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr, + typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {} + + template <contraction_type tp = contraction_tp> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 
0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find the last K offset so that kGroupOffset - kSize can compute the offset for loading the + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr) + : compute_panel<false>(itemID, thread_properties, out_ptr); + } + // The compute block computes the private contraction block for each thread and stores the result in the + // privateRes memory. The compute block function is independent of the local and no-local concepts, as + // it only computes the block in each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function writes the contraction result computed in the private memory of each thread to the global + // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in the base + // class. + template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // When local memory is not used, M and N are both accessed in a coalesced way.
However, when local memory is + // available, the K*N tile is transposed in local memory to N*K; therefore, each block operates on a blockId * + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNStride is always 1 because the coalesced access on N is loaded into local + // memory and extracting from local to global is the same as the non-transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privateOut = privateRes[wLPTM]; + if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) { + // Store the final results in C. The C matrix always has M as its first StorageIndex and N as its second + // StorageIndex; therefore it always has a coalesced layout + write<data_source::global_mem>(privateOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privateOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // When no local memory is used the following extract_block will be enabled + template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg, + contraction_type contract_tp = contraction_tp> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ?
NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) { + auto val = + read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout, + InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld); + + write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC), + data_source::private_mem>(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC), + data_source::private_mem>( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // If it is the lhs, we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size m-elements per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // The previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template <typename InputBlockProperties, StorageIndex TileSizeDimNC> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ?
linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC); + } + + template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template <contraction_type ctp = contraction_tp> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template <bool need_sync, contraction_type ctp = contraction_tp> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_space::local_space); +#else + return; +#endif + } + template <bool need_sync, contraction_type ctp = contraction_tp> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template <bool need_sync> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + template <bool is_internal_block> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties<StorageIndex> &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block<RHSBlockProperties, is_internal_block>( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread<contraction_tp == contraction_type::no_local>(itemID); + + // Tiling the Lhs block from global to local memory + extract_block<LHSBlockProperties, is_internal_block>( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread<contraction_tp == contraction_type::local>(itemID); + // switch to compute mode + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // When local memory is available the following compute_panel will be enabled + template <bool is_internal_block, typename OutPtr> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties<StorageIndex> &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = false; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store<is_internal_block, + contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local, + contraction_type contract_tp = contraction_tp> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type + extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisible by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ?
triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) { + auto val = + read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout, + InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld); + write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. 
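A minimal host-side sketch of the case this kernel serves (illustrative only; the sizes are arbitrary, and sycl_device stands in for a real Eigen::SyclDevice): a contraction whose non-contracting extent on one side is 1 is dispatched by the evaluator later in this file, via LaunchVT, to this kernel.

    // Sketch; assumes #include <unsupported/Eigen/CXX11/Tensor>.
    int main() {
      Eigen::Tensor<float, 2> lhs(1, 512);    // M = 1, K = 512: the LHS is effectively a vector
      Eigen::Tensor<float, 2> rhs(512, 256);  // K = 512, N = 256
      lhs.setRandom();
      rhs.setRandom();
      Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> dims = {Eigen::IndexPair<Eigen::Index>(1, 0)};
      // On a SYCL device this would read: out.device(sycl_device) = lhs.contract(rhs, dims);
      Eigen::Tensor<float, 2> out = lhs.contract(rhs, dims);  // 1 x 256 result
      return 0;
    }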
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in the K dimension in a tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs or rhs is the vector
+ *
+ * \tparam IsFinal: determines whether this is the final kernel. If so, the result will be written to the final output.
+ * Otherwise, the result of the contraction will be written to a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of the non-contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of the contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to
+  // make sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
+  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ?
false : true, Vectorizable, PacketReturnType> + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? 
compute_panel<true>(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel<false>(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template <bool is_internal_block, typename OutPtr> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block<VecBlockProperties, is_internal_block, KFactor, + Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary<is_internal_block>(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary<is_internal_block>( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC
+                                           : globalContractDimOffset + privateOffsetC)
+                             : OutScalar(0);
+
+        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        privateOffsetNC += Properties::LocalThreadSizeNC;
+      }
+      privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+    }
+
+    auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 elements in shared memory should reduce to 16*1
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      *out_scratch_ptr = outScalar[j];
+
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+    if (is_lhs_vec) {
+      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    }
+
+    out_scratch_ptr = local_output + outScratchIndex;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (contractId < offset) {
+          StorageIndex myNeighbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeighbourId];
+        }
+      }
+      // moving to the next 16 by 16 block
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+
+    if (contractId == 0) {
+      out_scratch_ptr = local_output + nonContractId;
+      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+      out_ptr += global_final_offset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+          auto res = *out_scratch_ptr;
+
+          *out_ptr = res;
+          out_ptr += Properties::LocalThreadSizeNC;
+        }
+        // moving to the next 16 by 16 block to get the next 16 reduced elements
+        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+      }
+    }
+  }
+
+  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+            typename Local>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+                                                                  const StorageIndex &linearLocalThreadId,
+                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
+    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+    StorageIndex cIndex = cOffset;
+    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+                                                                                              cIndex, StorageIndex(1));
+        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          OutScalar val =
+              (cIndex + i < C)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+                  : OutScalar(0);
+          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+        }
+      }
+      local_ptr += InputBlockProperties::c_stride * GroupSize;
+      cIndex += InputBlockProperties::c_stride * GroupSize;
+    }
+  }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of a tensor-tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This kernel reduces the two tensors to a scalar.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for the left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for the right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of the LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determines the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex rng;
+
+  EIGEN_DEVICE_FUNC
+  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+                           const StorageIndex rng_)
+      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_res.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    StorageIndex globalid = itemID.get_global_id(0);
+    StorageIndex localid = itemID.get_local_id(0);
+    OutScalar accumulator = OutScalar(0);
+    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    }
+    auto out_scratch_ptr = scratch_ptr + localid;
+    *out_scratch_ptr = accumulator;
+    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+      }
+    }
+    if (localid == 0) {
+      out_ptr[itemID.get_group(0)] = accumulator;
+    }
+  }
+};
+#endif
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase<TensorEvaluator< + const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> { static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value, "SYCL tensor contraction does not support output kernels."); - typedef const Eigen::SyclDevice Device; + typedef Eigen::SyclDevice Device; typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self; typedef TensorContractionEvaluatorBase<Self> Base; typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType; typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; + typedef typename XprType::Index StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; enum { Layout = TensorEvaluator<LeftArgType, Device>::Layout, + PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), + BlockAccess = false, }; - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
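The swap described in the removed comment rests on two facts: a row-major buffer read as column-major data is the transpose, and C = A * B is equivalent to C^T = B^T * A^T. A minimal stand-alone sketch with dense Eigen matrices (illustrative only, not part of the patch):

    #include <Eigen/Dense>
    #include <cassert>
    int main() {
      using RM = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
      RM A = RM::Random(4, 5), B = RM::Random(5, 3), C = A * B;
      // Reinterpreting each row-major buffer as column-major transposes it.
      Eigen::Map<Eigen::MatrixXf> At(A.data(), 5, 4), Bt(B.data(), 3, 5), Ct(C.data(), 3, 4);
      // A column-major kernel running with swapped operands fills C's memory correctly.
      assert(((Bt * At) - Ct).norm() < 1e-4f);  // C^T == B^T * A^T
      return 0;
    }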
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; + typedef array<StorageIndex, LDims> left_dim_mapper_t; + typedef array<StorageIndex, RDims> right_dim_mapper_t; - typedef array<Index, LDims> left_dim_mapper_t; - typedef array<Index, RDims> right_dim_mapper_t; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; + typedef array<StorageIndex, ContractDims> contract_t; + typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t; + typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t; static const int NumDims = LDims + RDims - 2 * ContractDims; - typedef DSizes<Index, NumDims> Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef DSizes<StorageIndex, NumDims> Dimensions; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator; + typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar; + typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar; typedef typename LeftEvaluator::Dimensions LeftDimensions; typedef typename RightEvaluator::Dimensions RightDimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; + if (!data) { + 
this->m_result = this->m_device.get( + static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; } + evalToSycl(data); + return (this->m_result != NULL); } - const Eigen::SyclDevice& device() const {return this->m_device;} - void evalTo(Scalar* buffer) const { - // Here is the result + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped<true, true, true, Unaligned>(buffer); - } - else { + } else { evalTyped<true, true, false, Unaligned>(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped<true, false, true, Unaligned>(buffer); - } - else { + } else { evalTyped<true, false, false, Unaligned>(buffer); } } - } - else { + } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped<false, true, true, Unaligned>(buffer); - } - else { + } else { evalTyped<false, true, false, Unaligned>(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped<false, false, true, Unaligned>(buffer); - } - else { + } else { evalTyped<false, false, false, Unaligned>(buffer); } } @@ -138,267 +1388,263 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - // rows in left side - const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator, + right_nocontract_t, contract_t, + PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, 
triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim); + } } - // required by sycl to construct the expr on the device. Returns original left_impl - const TensorEvaluator<LeftArgType, Device>& left_impl() const { - return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl); + + template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper> + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters; + launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters; + launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>( + buffer, lhs, rhs, triple_dim); + } +#endif } - // required by sycl to construct the expr on the device. 
Returns original right_impl - const TensorEvaluator<RightArgType, Device>& right_impl() const { - return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); + + template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties, + typename Properties, typename LhsMapper, typename RhsMapper> + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? ((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, + LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim, + PacketAccess, input_mapper_properties, true, ct> + ContractKernelName; + device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, + LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim, + PacketAccess, input_mapper_properties, false, ct> + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer<CoeffReturnType> Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType, + EvaluatorPointerType, Op> + ReductionKernel; + + device().template unary_kernel_launcher<CoeffReturnType, 
ReductionKernel>( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } } -}; -template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, -typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, -typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN, -typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ - typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr; - LHSFunctorExpr lhs_functors; - RHSFunctorExpr rhs_functors; - LhsLocalAcc localLhs; - RhsLocalAcc localRhs; - OutAccessor out_res; - size_t out_offset; - Index roundUpK, M, N, K; - ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; - LeftNocontractT m_i_strides, m_left_nocontract_strides; - RightNocontractT m_j_strides, m_right_nocontract_strides; - LHSTupleType left_tuple_of_accessors; - RHSTupleType right_tuple_of_accessors; - Device dev; - - - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, - Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, - ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), - out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), - m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), - m_right_contracting_strides(m_right_contracting_strides_), - m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), - m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - - void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr; - typedef typename 
Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr; - auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors); - auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors); - typedef decltype(lhs_dev_expr.expr) LeftArgType; - typedef decltype(rhs_dev_expr.expr) RightArgType; - typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, LeftNocontractT, - ContractT, 1, - lhs_inner_dim_contiguous, - false, Unaligned, MakeGlobalPointer> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, RightNocontractT, - ContractT, 1, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), - lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), - rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); - // Matmul Kernel - // Thread identifiers - const Index mLocalThreadId = itemID.get_local(0); // Local ID row - const Index nLocalThreadId = itemID.get_local(1); // Local ID col - const Index mGroupId = itemID.get_group(0); // Work-group ID row - const Index nGroupId = itemID.get_group(1); // Work-group ID localCol - const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - LhsScalar privateLhs; - RhsScalar privateRhs[WorkLoadPerThreadN]; - OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] = static_cast<OutScalar>(0); - } - } +#ifndef EIGEN_SYCL_DISABLE_GEMV + template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex> + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor> + Properties; + const StorageIndex roundUpC = 
Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper, + TensorMapper, StorageIndex, Properties, CFactor, false, + is_lhs_vec, false> + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - // Tile Lhs - for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localLhsRow = localLhsLinearId% TileSizeDimM; - Index localLhsCol = localLhsLinearId/TileSizeDimM; - // Load the value (wide vector load) - Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol; - localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0); - } - // Tile Rhs - for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localRhsRow = localRhsLinearId% TileSizeDimN; - Index localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; - localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0); + device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - } - // Loop over all tiles - const Index numTiles = roundUpK/TileSizeDimK; - Index firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - Index nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localLhsRow = localLhsLinearId% TileSizeDimM; - Index localLhsCol = localLhsLinearId/TileSizeDimM; - // global K id - Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; - // Store the loaded value into local memory - localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? 
lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0); - } - // Tile B - for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localRhsRow = localRhsLinearId% TileSizeDimN; - Index localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; - // Store the loaded vector into local memory - localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0); - } - } - // Loop over the values of a single tile - for (Index k=0; k<TileSizeDimK; k++) { - // Cache the values of localRhs in registers - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; - privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; - } - // Perform the computation - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; - privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; - } - } - } - // Next tile - firstHalf++; - } while (firstHalf<numTiles); - - // Store the final results in C - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; - if (globalRow< M){ - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; - if(globalCol<N) - out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN]; - } - } - } + typedef Eigen::internal::SumReducer<CoeffReturnType> Op; + typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType, + EvaluatorPointerType, Op> + ReductionKernel; + device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper, + TensorMapper, StorageIndex, Properties, CFactor, false, + is_lhs_vec, true> + ContractKernelName; + device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); } + } +#endif -}; -template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { - -static const Index TileSizeDimM = 32ul; // Tile size for dimension M -static const Index TileSizeDimN = 32ul; // Tile size for dimension N -static const Index TileSizeDimK = 16ul; // Tile size for dimension K -static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M -static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N -static const Index 
LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression - -// RoundUp function to make sure that the global threadId is divisable by local threadId -static Index RoundUp(Index x, Index y) { - return ((((x) + (y) - 1) / (y))*(y)); -} +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template <typename LhsMapper, typename RhsMapper> + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; -template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, - ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, - LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - - typedef typename Self::XprType HostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; - typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr; - typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr; - // extract lhs functor list - LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); - - Index roundUpK = RoundUp(K, TileSizeDimK); - Index roundUpM = RoundUp(M, TileSizeDimM); - Index roundUpN = RoundUp(N, TileSizeDimN); - ptrdiff_t out_offset = self.device().get_offset(buffer); - self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType; - // create lhs tuple of accessors - LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl()); - // create rhs tuple of accessors - RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl()); - - // Local memory for elements of Lhs - typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, 
cl::sycl::access::target::local> LhsLocalAcc;
-      LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
-      // Local memory for elements of Rhs
-      typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
-      RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
-
-      typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
-      //OutScalar memory
-      OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
-      // sycl parallel for
-      cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
-                                             cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
-                       KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
-                                         RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
-                                         WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
-                                         localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
-                                         m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
-    });
-    self.device().asynchronousExec();
+
+    // Here we force the code to use at most a two-step reduction: our empirical research shows that if each thread
+    // reduces at least 512 elements individually, we get better performance.
+    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+    const StorageIndex global_range = num_work_group * local_range;
+
+    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+        ContractKernelName;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+                                                                                    thread_range, local_range, K);
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, StorageIndex, local_range>
+          GenericRKernel;
+      device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+                                                                                    local_range, K);
+    }
+  }
+#endif
-};
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    this->m_leftImpl.cleanup();
+    this->m_rightImpl.cleanup();
+
+    if (this->m_result) {
+      this->m_device.deallocate_temp(this->m_result);
+      this->m_result = NULL;
+    }
+  }
+  // The placeholder accessors must be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    this->m_leftImpl.bind(cgh);
+    this->m_rightImpl.bind(cgh);
+    this->m_result.bind(cgh);
+  }
+};
+}  // namespace Eigen
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 5c94165d1..0218727d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -18,207 +18,252 @@ namespace Eigen {
 /** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
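For orientation, a minimal host-side sketch of the operation these kernels back (illustrative only; the sizes are arbitrary, and on a SYCL device the assignment would instead go through output.device(sycl_device)):

    // Sketch; assumes #include <unsupported/Eigen/CXX11/Tensor>.
    int main() {
      Eigen::Tensor<float, 2> input(64, 64);
      Eigen::Tensor<float, 1> kernel(7);
      input.setRandom();
      kernel.setRandom();
      Eigen::array<Eigen::Index, 1> dims = {0};  // convolve along dimension 0
      // Output is 58 x 64, since 64 - 7 + 1 = 58 along the convolved dimension.
      Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
      return 0;
    }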
- *
- *
- */
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel1D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize, range_x, range_y;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
-                          Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
-                          Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
-  :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
-   buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV1D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template <typename BooleanDim2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+    return (boolean_check[0] && boolean_check[1]);
+  }
   void operator()(cl::sycl::nd_item<2> itemID) {
-    typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
-    auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
-    auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
-    auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
-    const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
-    const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for each plane in shared memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
     /// fill the shared memory
-    for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
-      const size_t local_index = i + plane_kernel_offset ;
-      const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
-      if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
-        local_acc[local_index] = device_evaluator.coeff(tensor_index);
-      }
-      else local_acc[local_index]=0.0f;
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
     }
     itemID.barrier(cl::sycl::access::fence_space::local_space);
-    // calculate the convolution
-    const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
-    if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
       CoeffReturnType result = static_cast<CoeffReturnType>(0);
-      const size_t index = plane_kernel_offset+ itemID.get_local(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
       for (size_t k = 0; k < kernelSize; ++k) {
         result += (local_acc[k + index] * kernel_ptr[k]);
       }
-      const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
-      +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
-      buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
     }
   }
 };
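Each work-group of the rewritten CONV1D kernel stages tileSize + kernelSize - 1 inputs (its output tile plus a kernel-sized halo, zero-padded at the boundary) in local memory, synchronizes, then forms one dot product per output element. A minimal host-side model of that tiling arithmetic, as an illustrative sketch only (plain C++, no SYCL; not the Eigen kernel itself):

```cpp
#include <cstddef>
#include <vector>

// Host-side model of the CONV1D tile: stage tileSize + kernelSize - 1 inputs
// (tile plus halo, zero-padded past the end), then one dot product per output.
std::vector<float> conv1d_tiled(const std::vector<float>& in, const std::vector<float>& k,
                                std::size_t tileSize) {
  const std::size_t kernelSize = k.size();
  const std::size_t outSize = in.size() - kernelSize + 1;
  std::vector<float> out(outSize, 0.f);
  for (std::size_t group = 0; group * tileSize < outSize; ++group) {
    const std::size_t input_offset = group * tileSize;        // first input of this tile
    const std::size_t num_input = tileSize + kernelSize - 1;  // tile + halo
    std::vector<float> local_acc(num_input);                  // stands in for local memory
    for (std::size_t i = 0; i < num_input; ++i)               // cooperative fill in the kernel
      local_acc[i] = (input_offset + i < in.size()) ? in[input_offset + i] : 0.f;
    for (std::size_t t = 0; t < tileSize && input_offset + t < outSize; ++t) {
      float result = 0.f;
      for (std::size_t j = 0; j < kernelSize; ++j) result += local_acc[t + j] * k[j];
      out[input_offset + t] = result;
    }
  }
  return out;
}
```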
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel2D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
-                          Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
-                          Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
-  :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
-   buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV2D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
   void operator()(cl::sycl::nd_item<3> itemID) {
-    typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
-    auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
-    auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
-    auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
-    const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
-
-    /// fill the shared memory
-    const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
-    const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
-    for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
-      const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
-      for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for each plane in shared memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
         const size_t local_index = i + local_input_offset;
-        const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
-        if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
-          local_acc[local_index] = device_evaluator.coeff(tensor_index);
-        }
-        else local_acc[local_index]=0.0f;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+                                  in_range_dim1 && in_range_dim2)
+                                     ? device_evaluator.coeff(tensor_index)
+                                     : CoeffReturnType(0);
+      }
     }
-    }
     itemID.barrier(cl::sycl::access::fence_space::local_space);
-    // calculate the convolution
-    const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
-    const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
-    if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+    // output offset start for each thread
+    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    if (boundary_check(itemID.get_global_id() < input_range)) {
       CoeffReturnType result = static_cast<CoeffReturnType>(0);
-      for (size_t j = 0; j < kernelSize_y; j++) {
-        size_t kernel_offset =kernelSize_x * j;
-        const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
-        for (size_t i = 0; i < kernelSize_x; i++) {
-          result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
+
+      for (size_t j = 0; j < kernel_size[1]; j++) {
+        size_t kernel_offset = kernel_size[0] * j;
+        const size_t index =
+            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+        for (size_t i = 0; i < kernel_size[0]; i++) {
+          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
         }
       }
-      const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
-      +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
-      buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+                                                             itemID.get_local_id(1) + output_offset[1]);
+
+      buffer_ptr[tensor_index] = result;
     }
   }
 };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV3D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<3> kernel_size;
+  const cl::sycl::range<3> input_range;
+  const size_t numP;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+                         const size_t numP_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_),
+        numP(numP_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel3D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
-Buffer_accessor buffer_acc;
-ptrdiff_t out_offset;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
-                          Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
-                          const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
-                          Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
-  :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
-   kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
-   buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+    const auto output_offset =
+        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

-  void operator()(cl::sycl::nd_item<3> itemID) {
-    typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
-    auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
-
-    auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
-    auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory
-    const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
-    const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
-    const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
-    for(size_t p=0; p<numP; p++){
+    for (size_t p = 0; p < numP; p++) {
       /// fill the shared memory
-      const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
-      for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
-        for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
-          for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
-            const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
-            const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
-            if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
-              local_acc[local_index] = device_evaluator.coeff(tensor_index);
-            }
-            else local_acc[local_index]=0.0f;
+      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
+          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+            const size_t local_index = local_index_dim1 + i;
+            const size_t tensor_index =
+                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
           }
         }
       }
       itemID.barrier(cl::sycl::access::fence_space::local_space);

       // calculate the convolution
-      const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
-      const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
-      const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z
-      if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
+      if (boundary_check(itemID.get_global_id() < input_range)) {
         CoeffReturnType result = static_cast<CoeffReturnType>(0);
-        for (size_t k = 0; k < kernelSize_z; k++) {
-          for (size_t j = 0; j < kernelSize_y; j++) {
-            for (size_t i = 0; i < kernelSize_x; i++) {
-              const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
-              const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
+        for (size_t k = 0; k < kernel_size[2]; k++) {
+          for (size_t j = 0; j < kernel_size[1]; j++) {
+            for (size_t i = 0; i < kernel_size[0]; i++) {
+              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+              const size_t local_index =
+                  ((i + itemID.get_local_id(0)) +
+                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
               result += (local_acc[local_index] * kernel_ptr[kernel_index]);
             }
           }
         }
-        const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
-        +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
-        buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
+        const size_t tensor_index =
+            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+        buffer_ptr[tensor_index] = result;
       }

       itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -226,25 +271,32 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
     }
   }
 };
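All three specializations follow the same shape: a zero-padded halo load into local memory, a barrier, then the multiply-accumulate; CONV3D additionally loops over the numP planes and re-synchronizes before the local tile is overwritten for the next plane. The local-memory footprint grows as (tile + kernel - 1) per convolved dimension, which is what the gpu_assert calls in executeEval below check against sharedMemPerBlock(). A small self-contained check of that arithmetic (illustrative tile and kernel sizes):

```cpp
#include <cstddef>

// Elements of local memory needed by a 2D tile: (tile + kernel - 1) per
// convolved dimension, times one slice per plane handled by the third
// local dimension (mirrors the gpu_assert bound used in executeEval).
constexpr std::size_t conv2d_local_mem_elems(std::size_t tile_x, std::size_t tile_y,
                                             std::size_t tile_z, std::size_t k_x,
                                             std::size_t k_y) {
  return (tile_x + k_x - 1) * (tile_y + k_y - 1) * tile_z;
}
static_assert(conv2d_local_mem_elems(16, 16, 1, 3, 3) == 18 * 18,
              "a 16x16 tile with a 3x3 kernel stages an 18x18 halo block");
```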
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
-{
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

-  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
+  static const int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
   static const int NumKernelDims = internal::array_size<Indices>::value;
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
+  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
   typedef const Eigen::SyclDevice Device;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

   enum {
-    IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
+    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
     PacketAccess = false,
     BlockAccessV2 = false,
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
+    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
@@ -253,13 +305,22 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   typedef internal::TensorBlockNotImplemented TensorBlockV2;
   //===--------------------------------------------------------------------===//

-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
-  {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
-    const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+        m_kernelImpl.dimensions();

     m_dimensions = m_inputImpl.dimensions();
     for (int i = 0; i < NumKernelDims; ++i) {
@@ -271,21 +332,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     }
   }

-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
-  typedef typename InputArgType::Scalar Scalar;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
-  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     preloadKernel();
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       executeEval(data);
       return false;
     } else {
-      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+      m_buf = (EvaluatorPointerType)m_device.get(
+          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
       executeEval(m_buf);
       return true;
     }
@@ -294,194 +351,194 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
     if (m_buf) {
-      m_device.deallocate(m_buf);
+      m_device.deallocate_temp(m_buf);
       m_buf = NULL;
     }
     if (m_local_kernel) {
-      m_device.deallocate((void*)m_kernel);
+      m_device.deallocate_temp(m_kernel);
       m_local_kernel = false;
     }
     m_kernel = NULL;
   }
   /// used by sycl in order to build the sycl buffer
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
   /// used by sycl in order to build the sycl buffer
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buf; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
     // Don't make a local copy of the kernel unless we have to (i.e. it's an
     // expression that needs to be evaluated)
-    const Scalar* in_place = m_kernelImpl.data();
+    typename KernelStorage::Type in_place = m_kernelImpl.data();
     if (in_place) {
       m_kernel = in_place;
       m_local_kernel = false;
     } else {
       ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
-      EvalTo evalToTmp(local, m_kernelArg);
-      const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
-      internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
       m_kernel = local;
       m_local_kernel = true;
     }
   }
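preloadKernel() keeps the existing device buffer when the kernel expression is already materialized and otherwise evaluates it once into a temporary allocation that cleanup() later releases via deallocate_temp. The same pattern in miniature (hypothetical helper names: evalTo, size, and the float-only signature here stand in for the TensorEvalToOp/TensorExecutor machinery above):

```cpp
// Reuse a materialized buffer if one exists; otherwise evaluate once into a
// temporary the caller must later release (the role of the m_local_kernel flag).
template <typename Device, typename Evalr>
const float* preload(const Device& dev, Evalr& impl, bool& local_copy) {
  if (const float* in_place = impl.data()) {  // already resident on the device
    local_copy = false;
    return in_place;
  }
  float* tmp = static_cast<float*>(dev.allocate_temp(impl.size() * sizeof(float)));
  impl.evalTo(tmp);   // hypothetical: evaluate the expression into tmp
  local_copy = true;  // cleanup() must call dev.deallocate_temp(tmp)
  return tmp;
}
```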
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
-    typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
     typedef typename InputEvaluator::Dimensions InputDims;
+    switch (NumKernelDims) {
+      case 1: {
+        const size_t numX = dimensions()[m_indices[0]];
+        const size_t numP = dimensions().TotalSize() / numX;
+        const auto input_dim = std::array<size_t, 2>{numX, numP};
+        auto global_range = cl::sycl::range<2>{};
+        auto local_range = cl::sycl::range<2>{};
+        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 1> indices{{m_indices[0]}};
+        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+            ConvKernel;
+
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+        break;
+      }

-    typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
-    // extract input functor list
-    InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
-    ptrdiff_t out_offset = m_device.get_offset(data);
-
-
-    m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
-
-      typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
-      /// work-around for gcc 4.8 auto bug
-      typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
-      // create input tuple of accessors
-      InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
-
-      typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
-      OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
-      typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
-      KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
-
-      switch (NumKernelDims) {
-        case 1: {
-          const size_t numX = dimensions()[m_indices[0]];
-          const size_t numP = dimensions().TotalSize() / numX;
-          const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
-          size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
-          m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
-          const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
-          gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
-          auto global_range=cl::sycl::range<2>(GRange_x, GRange_y);  // global range
-          auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y);  // local range
-          InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
-          const array<Index, 1> indices{{m_indices[0]}};
-          const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
-          internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-          cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
-                           EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
-                                                    InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-                                                    indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
-          break;
-        }
-
-        case 2: {
-          const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
-          const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
-          const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
-          const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
-          const size_t numX = dimensions()[m_indices[idxX]];
-          const size_t numY = dimensions()[m_indices[idxY]];
-          const size_t numP = dimensions().TotalSize() / (numX*numY);
-          size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
-          m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
-          const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
-          gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
-          auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z);  // global range
-          auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z);  // local range
-          InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
-          const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
-          const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
-          internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-          cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
-                           EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
-                                                    InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-                                                    indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
-          break;
-        }
+      case 2: {
+        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numP = dimensions().TotalSize() / (numX * numY);
+        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+        const size_t local_memory_size =
+            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+        const array<Index, 2> kernel_dims{
+            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+        break;
+      }

-        case 3: {
-          const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
-          const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
-          const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
-          const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
-          const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
-          const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
-          const size_t numX = dimensions()[m_indices[idxX]];
-          const size_t numY = dimensions()[m_indices[idxY]];
-          const size_t numZ = dimensions()[m_indices[idxZ]];
-          const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
-          const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
-          const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
-          internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-          size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
-          m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
-          const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
-          gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
-          auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z);  // global range
-          auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z);  // local range
-          InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
-          cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
-                           EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
-                                                    InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-                                                    indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
-                                                    numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
-          break;
-        }
+      case 3: {
+        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const array<Index, 3> indices{
+            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+                                           m_kernelImpl.dimensions()[kernel_index[1]],
+                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        auto local_memory_range = (local_range + kernel_size - 1);
+        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+        break;
+      }

-        default: {
-          EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
-        }
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
       }
-    });
-    m_device.asynchronousExec();
+    }
   }
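Each case now hands the input evaluator, the preloaded kernel pointer, and the output pointer to binary_kernel_launcher (defined in the TensorDeviceSycl.h hunk further below), with the trailing arguments forwarded to the functor's constructor after the local-memory scratch accessor. Roughly, one such call expands to the following command group. This is a simplified sketch grounded in that hunk, with the optional EIGEN_SYCL_USE_PROGRAM_CLASS path elided and launch_binary/KernelFunctor as illustrative names:

```cpp
#include <CL/sycl.hpp>

// Sketch of what a binary_kernel_launcher call performs: bind the placeholder
// accessors, allocate local scratch, then launch the kernel functor.
template <typename OutScalar, typename KernelFunctor, typename Lhs, typename Rhs,
          typename OutPtr, typename Range, typename... T>
void launch_binary(cl::sycl::queue& q, const Lhs& lhs, const Rhs& rhs, OutPtr outptr,
                   Range thread_range, size_t scratch_size, T... var) {
  q.submit([=](cl::sycl::handler& cgh) {
    lhs.bind(cgh);  // placeholder accessors attach to this command group
    rhs.bind(cgh);
    outptr.bind(cgh);
    cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write,
                       cl::sycl::access::target::local>
        scratch(cl::sycl::range<1>(scratch_size), cgh);
    cgh.parallel_for(thread_range, KernelFunctor(scratch, lhs, rhs, outptr, var...));
  });
}
```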
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
-    eigen_assert(m_buf);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf != NULL);
     eigen_assert(index < m_dimensions.TotalSize());
     return m_buf[index];
   }

-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
-  {
-    eigen_assert(m_buf);
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
     eigen_assert(index < m_dimensions.TotalSize());
-    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
   }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
-  costPerCoeff(bool vectorized) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
     // model.
     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
     // We ignore the use of fused multiply-add.
-    const double convolve_compute_cost =
-        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
     const double firstIndex_compute_cost =
         NumDims *
-        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
-         TensorOpCost::DivCost<Index>());
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
-           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
-                          m_kernelImpl.costPerCoeff(vectorized) +
-                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
-                                       PacketSize));
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_kernelImpl.bind(cgh);
+    m_inputImpl.bind(cgh);
+    m_buf.bind(cgh);
+    m_kernel.bind(cgh);
   }

  private:
   // No assignment (copies are needed by the kernels)
-  TensorEvaluator& operator = (const TensorEvaluator&);
-  TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
+  TensorEvaluator &operator=(const TensorEvaluator &);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
   KernelArgType m_kernelArg;
-  TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
   Indices m_indices;
   Dimensions m_dimensions;
-  Scalar* m_buf;
-  const Scalar* m_kernel;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
   bool m_local_kernel;
-  const Eigen::SyclDevice& m_device;
-};
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};  // end struct TensorEvaluator

-} // end namespace Eigen
+}  // end namespace Eigen

-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6f8b6f193..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,7 +16,6 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

 #include <unordered_set>
-
 namespace Eigen {

 namespace TensorSycl {
@@ -70,9 +69,9 @@ struct SyclDeviceInfo {
 }  // end namespace TensorSycl

 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and 
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently 
-// TensorFlow via the Eigen SYCL Backend. 
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
     -> decltype(cl::sycl::device::get_devices()) {
 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@@ -421,6 +420,91 @@ class QueueInterface {
     return pMapper.get_offset(ptr);
   }

+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a command group handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a command group handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                   Range thread_range,
+                                                   Index scratchSize,
+                                                   T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a command group handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
   EIGEN_STRONG_INLINE void synchronize() const {
 #ifdef EIGEN_EXCEPTIONS
     m_queue.wait_and_throw();
@@ -429,6 +513,7 @@ class QueueInterface {
 #endif
   }

+
   EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
     set_latest_event(e);
 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
@@ -457,11 +542,10 @@ class QueueInterface {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -469,26 +553,28 @@ class QueueInterface {
                                 EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize1 =
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1);
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }
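The reworked parallel_for_setup picks power-of-two local sizes whose product stays within the device work-group limit, then rounds each global range up to a multiple of the local range so the resulting nd_range is always valid; the kernels compensate for the overshoot with boundary_check. The rounding rule in isolation:

```cpp
#include <cstddef>

// Global range = input extent rounded up to the next multiple of the local
// range; out-of-range work-items are masked by boundary_check in the kernel.
std::size_t round_up_to_local(std::size_t extent, std::size_t local) {
  const std::size_t rem = extent % local;
  return rem == 0 ? extent : extent + (local - rem);
}
// e.g. round_up_to_local(100, 16) == 112, i.e. 7 work-groups of 16.
```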
@@ -496,9 +582,9 @@ class QueueInterface {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
     Index max_workgroup_Size =
         static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
     max_workgroup_Size =
@@ -506,45 +592,48 @@ class QueueInterface {
                                 EIGEN_SYCL_LOCAL_THREAD_DIM1),
                  static_cast<Index>(max_workgroup_Size));
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize2 =
+    local_range[2] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
-    rng2 = dim2;
-    if (rng2 == 0) rng1 = static_cast<Index>(1);
-    GRange2 = rng2;
-    if (tileSize2 > GRange2)
-      tileSize2 = GRange2;
-    else if (GRange2 > tileSize2) {
-      Index xMode = static_cast<Index>(GRange2 % tileSize2);
-      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
     }
     pow_of_2 = static_cast<Index>(
-        std::log2(static_cast<Index>(max_workgroup_Size / tileSize2)));
-    tileSize1 =
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
         static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    rng1 = dim1;
-    if (rng1 == 0) rng1 = static_cast<Index>(1);
-    GRange1 = rng1;
-    if (tileSize1 > GRange1)
-      tileSize1 = GRange1;
-    else if (GRange1 > tileSize1) {
-      Index xMode = static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
     }
-    tileSize0 =
-        static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2));
-    rng0 = dim0;
-    if (rng0 == 0) rng0 = static_cast<Index>(1);
-    GRange0 = rng0;
-    if (tileSize0 > GRange0)
-      tileSize0 = GRange0;
-    else if (GRange0 > tileSize0) {
-      Index xMode = static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
     }
   }

   EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
     return false;
-#elif defined(EIGEN_SYCL_LOCA_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
     return true;
@@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,
-                                              Index &tileSize0,
-                                              Index &tileSize1, Index &rng0,
-                                              Index &rng1, Index &GRange0,
-                                              Index &GRange1) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
-                                       rng1, GRange0, GRange1);
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }

   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(
-      Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1,
-      Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0,
-      Index &GRange1, Index &GRange2) const {
-    queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
-                                       tileSize2, rng0, rng1, rng2, GRange0,
-                                       GRange1, GRange2);
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
   }

   /// allocate device memory
@@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
     return queue_stream()->getDeviceVendor();
   }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
 };

 }  // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 9926046b9..b83174ab7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
 // SYCL Executor policy
 #ifdef EIGEN_USE_SYCL

-template <bool Vectorizable, typename Evaluator>
-struct ExecExprFunctorKernel_impl {
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
   typedef typename Evaluator::Index Index;
-  const Index range;
-  const Index vectorizable_threads;
   Evaluator evaluator;
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
-      const Index range_, const Index vectorizable_threads_,
-      Evaluator evaluator_)
-      : range(range_), vectorizable_threads(vectorizable_threads_),
-        evaluator(evaluator_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
-  operator()(cl::sycl::nd_item<1> itemID) {
+  const Index range;
+  template <typename Scratch>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+      const Scratch, Evaluator evaluator_, const Index range_)
+      : evaluator(evaluator_), range(range_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+      cl::sycl::nd_item<1> itemID) {
+    compute(itemID);
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
     Index gId = static_cast<Index>(itemID.get_global_linear_id());
     Index total_threads = itemID.get_global_range(0);
-    EIGEN_UNROLL_LOOP
+
     for (Index i = gId; i < range; i += total_threads) {
       evaluator.evalScalar(i);
     }
   }
-};
-
-template <typename Evaluator>
-struct ExecExprFunctorKernel_impl<true, Evaluator> {
-  typedef typename Evaluator::Index Index;
-  const Index range;
-  const Index vectorizable_threads;
-  Evaluator evaluator;
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
-      const Index range_, const Index vectorizable_threads_,
-      Evaluator evaluator_)
-      : range(range_), vectorizable_threads(vectorizable_threads_),
-        evaluator(evaluator_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
-  operator()(cl::sycl::nd_item<1> itemID) {
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    const Index vectorizedRange =
+        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
     Index gId = static_cast<Index>(itemID.get_global_linear_id());
-    if (gId < vectorizable_threads) {
-      const Index PacketSize = Eigen::internal::unpacket_traits<
-          typename Evaluator::PacketReturnType>::size;
-      evaluator.evalPacket(gId * PacketSize);
-      gId += (vectorizable_threads * PacketSize);
-      EIGEN_UNROLL_LOOP
-      for (Index i = gId; i < range; i += vectorizable_threads) {
-        evaluator.evalScalar(i);
-      }
+    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    const Index start = Evaluator::PacketSize * gId;
+    for (Index i = start; i < vectorizedRange; i += step) {
+      evaluator.evalPacket(i);
+    }
+    gId += vectorizedRange;
+    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+      evaluator.evalScalar(i);
     }
   }
 };
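The single ExecExprFunctorKernel now selects between the two compute() overloads at compile time via Evaluator::PacketAccess: the vectorized overload walks the largest PacketSize-multiple prefix with evalPacket and mops up the remainder with evalScalar. The same split on plain arrays, as an illustrative sketch (PacketSize = 4 is an arbitrary example value, not a value Eigen fixes):

```cpp
#include <cstdint>

// Packet/scalar split as in the vectorized compute() overload above:
// packets cover the largest PacketSize-multiple prefix, scalars the tail.
void add_inplace_split(const float* x, float* y, std::int64_t n) {
  constexpr std::int64_t PacketSize = 4;  // illustrative; Eigen derives this per type
  const std::int64_t vectorizedRange = (n / PacketSize) * PacketSize;
  for (std::int64_t i = 0; i < vectorizedRange; i += PacketSize)
    for (std::int64_t j = 0; j < PacketSize; ++j) y[i + j] += x[i + j];  // evalPacket(i)
  for (std::int64_t i = vectorizedRange; i < n; ++i) y[i] += x[i];       // evalScalar(i)
}
```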
ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value, - Evaluator> { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value, - Evaluator>(range_, vectorizable_threads_, evaluator) {} -}; - -template <typename Expr, typename Evaluator> -struct ExecExprFunctorKernel<Expr, false, Evaluator> - : ExecExprFunctorKernel_impl<false, Evaluator> { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl<false, Evaluator>( - range_, vectorizable_threads_, evaluator) {} -}; - template <typename Expression, bool Vectorizable, TiledEvaluation Tiling> class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> { - public: + public: typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { - Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); total_size = (total_size == 0) ? 1 : total_size; - const int PacketSize = Eigen::PacketType< - typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType, - Eigen::SyclDevice>::size; - Index vectorizable_threads = - static_cast<Index>(total_size / PacketSize); + const int PacketSize = + Eigen::PacketType<typename Evaluator::CoeffReturnType, + Eigen::SyclDevice>::size; + Index vectorizable_threads = static_cast<Index>(total_size / PacketSize); dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); range = total_size; - auto f = [&](cl::sycl::handler &cgh) { - evaluator.bind(cgh); - typedef ExecExprFunctorKernel<Expression, true, - Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>> - conditional_vectorized_kernel; - - typedef ExecExprFunctorKernel<Expression, false, - Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>> - non_vectorized_kernel; -// This is to make sure that an expression with a size less than vectorized size -// will not call the vectorized kernel. -// The reason for having this kernel is that the vectorisable parameter is a -// compile-time parameter, -// however, the size of a tensor is a run-time parameter - (vectorizable_threads) - ? 
cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel<vectorized_kernel>(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - conditional_vectorized_kernel(range, vectorizable_threads, - evaluator)) - : cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel<non_vectorized_kernel>(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - non_vectorized_kernel(range, vectorizable_threads, - evaluator)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); - dev.async_synchronize(e); + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel<Evaluator> >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 389d5d906..b115e502b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -123,7 +123,7 @@ struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {}; namespace TensorSycl { namespace internal{ -template <typename Evaluator, typename Op> class ReductionFunctor; +template <typename Evaluator, typename Op> class GenericNondeterministicReducer; } } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 700337539..d3628f94e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -421,7 +421,7 @@ template <typename Index, typename Device, bool BlockAccess> struct MemcpyTrigge #ifdef EIGEN_USE_GPU template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif @@ -430,7 +430,7 @@ template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index #ifdef EIGEN_USE_SYCL template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 84604cf41..0bb1e643e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -946,7 +946,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M #endif #if defined(EIGEN_USE_SYCL) - template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor; + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; // SYCL needs the Generic reducer for the case where the reduction
algorithm is neither an inner, outer, nor a full reducer template <typename, typename, typename> friend struct internal::GenericReducer; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index a379f5a94..387c3edf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -11,167 +11,576 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /***************************************************************** - * TensorSyclPlaceHolderExpr.h + * TensorReductionSycl.h * * \brief: - * This is the specialisation of the placeholder expression based on the - * operation type + * This is the specialization of the reduction operation. A two-phase reduction approach + * is used, since the GPU provides no global synchronization of global memory across + * different work-groups/thread blocks. To solve the problem, we create two kernels + * to reduce the data: in the first kernel each work-group/thread-block reduces its data + * locally and saves the partial result to global memory. In the second phase (global reduction) + * a single work-group/thread-block reduces the intermediate data into one single element. + * Here is an NVIDIA presentation explaining the optimized two-phase reduction algorithm on the GPU: + * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf * -*****************************************************************/ + *****************************************************************/ #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP - namespace Eigen { +namespace TensorSycl { namespace internal { -template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ -template<typename BufferTOut, typename BufferTIn> -static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - do { - auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { - cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, - cl::sycl::range<1>{std::min(length, local)}}; - /* Two accessors are used: one to the buffer that is being reduced, - * and a second to local memory, used to store intermediate data. */ - auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h); - auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h); - typedef decltype(aI) InputAccessor; - typedef decltype(aOut) OutputAccessor; - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor; - LocalAccessor scratch(cl::sycl::range<1>(local), h); - - /* The parallel_for invocation chosen is the variant with an nd_item - * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local)); - }; - dev.sycl_queue().submit(f); - dev.asynchronousExec(); - - /* At this point, you could queue::wait_and_throw() to ensure that - * errors are caught quickly. However, this would likely impact - * performance negatively.
*/ - length = length / local; - - } while (length > 1); -} +template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable> +struct OpDefiner { + typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType; + typedef Op type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &) { + return accumulator; + } }; -template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ -template<typename BufferTOut, typename BufferTIn> -static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), - bufOut, out_offset, bufI, dev, length, local); -} +template <typename CoeffReturnType, typename Index> +struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> { + typedef Eigen::internal::SumReducer<CoeffReturnType> type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, + const Index &scale) { + ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op; + return quotient_op(accumulator, CoeffReturnType(scale)); + } }; -/// Self is useless here because in expression construction we are going to treat reduction as a leafnode. -/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the -/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as -// a leafNode. +template <typename CoeffReturnType, typename Index> +struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> { + typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType; + typedef Eigen::internal::SumReducer<CoeffReturnType> type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) { + return type(); + } -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &scale) { + return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale))); + } +}; - typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; - - static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); - int red_factor =256; /// initial reduction. 
If the size is less than red_factor we only creates one thread. - size_t inputSize =self.impl().dimensions().TotalSize(); - size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input - size_t remaining = inputSize% red_factor; - if(rng ==0) { - red_factor=1; - }; - size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2; - size_t GRange=std::max((size_t )1, rng); - - // convert global range to power of 2 for redecution - GRange--; - GRange |= GRange >> 1; - GRange |= GRange >> 2; - GRange |= GRange >> 4; - GRange |= GRange >> 8; - GRange |= GRange >> 16; -#if __x86_64__ || __ppc64__ || _WIN64 - GRange |= GRange >> 32; -#endif - GRange++; - size_t outTileSize = tileSize; - /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. - if (GRange < outTileSize) outTileSize=GRange; - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange)); - typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; - // Dims dims= self.xprDims(); - //Op functor = reducer; - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a workaround for gcc 4.8 bug - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; - // create a tuple of accessors from Evaluator - TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto tmp_global_accessor = temp_global_buffer. 
template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(tmp_global_accessor) OutAccessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), - TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType> - (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); - }); - dev.asynchronousExec(); - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - ptrdiff_t out_offset = dev.get_offset(output); - /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); +template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index, + Index local_range> +struct SecondStepFullReducer { + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + LocalAccessor; + typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef; + typedef typename OpDef::type Op; + LocalAccessor scratch; + InputAccessor aI; + OutputAccessor outAcc; + Op op; + SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) + : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { + // Our empirical research shows that the best performance is achieved + // when there is only one element per thread to reduce in the second step; + // in that case the time spent in the second-step reduction is almost + // negligible. Hence, in the second step the input size is fixed to the + // local size, so each thread reads exactly one element. The algorithm + // must be changed if the number of reductions per thread in the second + // step grows beyond 1; otherwise the result will be wrong. + const Index localid = itemID.get_local_id(0); + auto aInPtr = aI.get_pointer() + localid; + auto aOutPtr = outAcc.get_pointer(); + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); +#pragma unroll 8 + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction, first phase. In this version vectorization is enabled and the reduction accepts +// any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
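Note: the halving loop in SecondStepFullReducer is the classic work-group tree reduction, and the same pattern reappears in the first-phase kernel that follows. Below is a condensed, self-contained sketch of just that pattern, assuming a power-of-two work-group size and an Eigen-style reducer exposing reduce(t, &accum) and finalize(accum); the helper name and signature are illustrative only, not part of this patch:

template <typename T, typename Op, typename LocalAcc>
T workgroup_tree_reduce(LocalAcc scratch, T value, Op op, cl::sycl::nd_item<1> item) {
  const int lid = static_cast<int>(item.get_local_id(0));
  const int local_size = static_cast<int>(item.get_local_range(0));  // must be a power of 2
  scratch[lid] = op.finalize(value);
  // Halve the number of active work-items each step; the barrier separates
  // the writes of one step from the reads of the next.
  for (int offset = local_size / 2; offset > 0; offset /= 2) {
    item.barrier(cl::sycl::access::fence_space::local_space);
    if (lid < offset) {
      op.reduce(scratch[lid + offset], &value);
      scratch[lid] = op.finalize(value);
    }
  }
  return value;  // only work-item 0 ends up holding the whole-group result
}

Because only work-item 0 holds the final value, the first phase emits exactly one partial result per work-group, which is why a single work-group suffices for the second step.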
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range> +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index, + (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef + typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), + PacketReturnType, CoeffReturnType>::type OutType; + typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + + template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>(); +#pragma unroll(8 / Evaluator::PacketSize) + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket<PacketReturnType>( + ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator)); + } } + template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + 
auto output_ptr = final_output.get_pointer(); + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // initialize the accumulator + CoeffReturnType accumulator = op.initialize(); + // scalar (non-vectorized) reduction + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction part; the local size is always a power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } }; +template <typename Evaluator, typename OpType> +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef; + typedef typename OpDef::type Op; + template <typename Scratch> + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} -template <typename Self, typename Op> -struct InnerReducer<Self, Op, const Eigen::SyclDevice> { + void operator()(cl::sycl::nd_item<1> itemID) { + auto output_accessor_ptr = output_accessor.get_pointer(); + /// const cast added as a naive solution to solve the qualifier drop error + Index globalid = static_cast<Index>(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce( + evaluator, evaluator.firstInput(globalid), functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// partial reducer: the non-reduced dimension is preserved +template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt> +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index 
num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; +#pragma unroll 8 + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + auto scratchPtr = scratch.get_pointer().get(); + auto outPtr = + output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. 
*/ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // LocalThreadSizeR is always a power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for the mean reducer in the + // previous reduction, so there is no need to divide it again + *out_scratch_ptr = op.finalize(accumulator); + } + /* The barrier ensures that all work-items' local-memory accesses from + * the previous step have completed before execution continues + * (strictly speaking, this only holds for the work-items within a + * single work-group - there is no co-ordination between work-groups, + * only between work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType> +struct SecondStepPartialReduction { + typedef OpDefiner<OpType, OutScalar, Index, false> OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor.get_pointer() + globalId; + + OutScalar accumulator = op.initialize(); +// num_coeffs_to_reduce is not bigger than 256 +#pragma unroll 8 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor.get_pointer()[globalId] = op.finalize(accumulator); + } +}; + +template <typename Index, Index LTP, Index LTR, bool BC_> +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt> +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true> + PannelParameters;
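The index arithmetic in PartialReductionKernel::element_wise_reduce above collapses a 2-D (preserve x reduce) thread grid onto the flat input. A minimal sketch of the offset computation it performs, mirroring the global_offset expression; the helper name is illustrative, not part of this patch:

template <typename Index>
Index partial_reduction_offset(bool outer_most, Index pId, Index rId,
                               Index num_coeffs_to_preserve, Index num_coeffs_to_reduce) {
  // outer_most: the preserved dimension is contiguous, so consecutive pIds touch
  // consecutive elements and each rId step strides over a whole preserved panel.
  // inner_most: the reduced dimension is contiguous within each preserved slice.
  return outer_most ? pId + rId * num_coeffs_to_preserve
                    : rId + pId * num_coeffs_to_reduce;
}

For example, with num_coeffs_to_preserve = 4 and num_coeffs_to_reduce = 3, the thread (pId = 1, rId = 2) starts at element 9 for an outer_most reduction and at element 5 for an inner_most one, matching the two branches above.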
+ + typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure the local range is a power of 2 and <= + // maxSyclThreadPerBlock; this helps us avoid an extra check in the + // kernel + static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & + (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + + EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + // In this step we force the code to use at most a 2-step reduction: + // our empirical research shows that if each thread reduces at least 64 + // elements individually we get better performance, although this can + // change on different platforms. It also shows that for an inner_most + // dim reducer it is better to have 8 groups in the reduce dimension for + // sizes > 1024 to achieve the best performance. + const Index reductionPerThread = 64; + Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); + const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; + Index rGroups = (cu + pNumGroups - 1) / pNumGroups; + const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; + const Index globalRange = pNumGroups * rNumGroups * localRange; + + EIGEN_CONSTEXPR Index scratchSize = + PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (rNumGroups > 1) { + CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>( + dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType temp_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>( + self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + + typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op> + SecondStepPartialReductionKernel; + + dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>( + temp_accessor, output, + cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1), + reducer, num_coeffs_to_preserve, rNumGroups); + + self.device().deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>( + self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + } + return false; + } +}; +} // namespace internal +} // namespace TensorSycl + +namespace internal { - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { - typedef const typename Self::ChildType HostExpr; /// this is the child of 
reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> { + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1; + static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { + typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType; + static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + typename Self::Index inputSize = self.impl().dimensions().TotalSize(); + // In this step we force the code to use at most a 2-step reduction: + // our empirical research shows that if each thread reduces at least 512 + // elements individually, we get better performance. + const Index reductionPerThread = 2048; + Index reductionGroup = dev.getPowerOfTwo( + (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); + const Index num_work_group = std::min(reductionGroup, local_range); + const Index global_range = num_work_group * local_range; + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t; + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); + typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range, + local_range, inputSize, reducer); + + typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType, + EvaluatorPointerType, Index, local_range> + GenericRKernel; + dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>( + tmp_global_accessor, data, + cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group, + reducer); + + dev.deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize, + reducer); + } + } +}; +// vectorizable inner_most dim preserver +// col reduction +template <typename Self, typename Op> +struct OuterReducer<Self, Op, Eigen::SyclDevice> { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, 
::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; +// row reduction +template <typename Self, typename Op> +struct InnerReducer<Self, Op, Eigen::SyclDevice> { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; + +// ArgMax uses this kernel for partial reduction. +// TODO(@mehdi.goli) come up with a better kernel +// generic partial reduction +template <typename Self, typename Op> +struct GenericReducer<Self, Op, Eigen::SyclDevice> { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { typename Self::Index range, GRange, tileSize; - typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is workaround for gcc 4.8 bug. - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; - // create a tuple of accessors from Evaluator - Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output); - ptrdiff_t out_offset = dev.get_offset(output); - Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); - - }); - dev.asynchronousExec(); + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher<typename Self::CoeffReturnType, + TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? 
num_values_to_reduce : static_cast<Index>(1)); return false; } }; -} // end namespace internal +} // namespace internal } // namespace Eigen #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 000000000..0078692cd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorScanSycl.h + * + * \brief: + * Tensor Scan Sycl implements an extended version of + * "Efficient parallel scan algorithms for GPUs." for Tensor operations. + * The algorithm requires up to 3 stages (and consequently 3 kernels), depending + * on the size of the tensor. In the first kernel (ScanKernelFunctor), each + * thread within the work-group individually reduces its allocated elements in + * order to reduce the total number of blocks. In the next step all + * threads within the work-group reduce the associated blocks into the + * temporary buffer. In the next kernel (ScanBlockKernelFunctor), the temporary + * buffer is given as input and all the threads within a work-group scan and + * reduce the boundaries between the blocks (generated by the previous + * kernel) and write the data to the temporary buffer. If the second kernel is + * required, the third and final kernel (ScanAdjustmentKernelFunctor) + * adjusts the final result into the output buffer. + * The original algorithm for the parallel prefix sum can be found here: + * + * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel + * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 + * 1, no. 1 (2008): 1-17. 
+ *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template <typename index_t> +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index, + scan_step stp> +struct ScanKernelFunctor { + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_accessor; + OutAccessor temp_accessor; + const ScanParameters<Index> scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters<Index> scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_accessor(out_accessor_), + temp_accessor(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template <scan_step sst = stp, typename Input> + typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt.coeff(global_id); + } + + template <scan_step sst = stp, typename Input> + typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt[global_id]; + } + + template <scan_step sst = stp, typename InclusiveOp> + typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp inclusive_op) { + inclusive_op(); + } + + template <scan_step sst = stp, typename InclusiveOp> + typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + 
first_step_inclusive_Operation(InclusiveOp) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_accessor.get_pointer(); + auto tmp_ptr = temp_accessor.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? 
read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1]; + } + }); + // This for loop runs exactly 2 iterations (ScanPerThread / PacketSize == 2) + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[bi] = accumulator.finalize(accum); + } + offset *= 2; + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // next step optimisation + if (local_id == 0) { + if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { + const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * + scanParameters.non_scan_size + + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + + block_id; + tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset]; + } + // clear the last element + scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); + } + // traverse down tree & build scan + for (Index d = 1; d < scratch_stride; d *= 2) { + offset >>= 1; + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[ai] = scratch_ptr[bi]; + scratch_ptr[bi] = accumulator.finalize(accum); + } + } + // Synchronise + 
itemID.barrier(cl::sycl::access::fence_space::local_space); + // This for loop runs exactly 2 iterations (ScanPerThread / PacketSize == 2) + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) { + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; i++) { + CoeffReturnType accum = private_scan[packetIndex + i]; + accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); + private_scan[packetIndex + i] = accumulator.finalize(accum); + } + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan); + private_scan[0] = accumulator.finalize(inclusive_scan); + } + }); + next_elements = 0; + // write out the first set of private params + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive)); + out_ptr[global_id] = private_scan[private_id]; + } + next_elements += scanParameters.scan_stride; + } + } // end for loop + } +}; + +template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index> +struct ScanAdjustmentKernelFunctor { + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2; + InAccessor in_accessor; + OutAccessor out_accessor; + const ScanParameters<Index> scanParameters; + Op accumulator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, + OutAccessor out_accessor_, + const ScanParameters<Index> scanParameters_, + Op accumulator_) + : in_accessor(in_accessor_), + out_accessor(out_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto in_ptr = in_accessor.get_pointer(); + auto out_ptr = out_accessor.get_pointer(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. 
+ // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template <typename Index> +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread; + group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread; + block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * 
ScanParameters<Index>::ScanPerThread)));
+  }
+  inline ScanParameters<Index> get_scan_parameter() {
+    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+  }
+  inline cl::sycl::nd_range<1> get_thread_range() {
+    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+  }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+                                                           Reducer &accumulator, const Index total_size,
+                                                           const Index scan_size, const Index panel_size,
+                                                           const Index non_scan_size, const Index scan_stride,
+                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+        AdjustFunctor;
+    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+                                                                       scan_info.max_elements_per_block,
+                                                                       scan_info.get_scan_parameter(), accumulator);
+  }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+                                             const Index total_size, const Index scan_size, const Index panel_size,
+                                             const Index non_scan_size, const Index scan_stride,
+                                             const Index non_scan_stride, const bool inclusive,
+                                             const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+    CoeffReturnType *temp_pointer =
+        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+        scan_info.get_scan_parameter(), accumulator, inclusive);
+
+    if (scan_info.block_size > 1) {
+      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+          non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+          non_scan_stride, dev);
+    }
+    dev.deallocate_temp(temp_pointer);
+  }
+};
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice> {
+  typedef typename Self::Index Index;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+
typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) { + const Index total_size = internal::array_prod(self.dimensions()); + const Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) + auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h deleted file mode 100644 index 7b8bd2df7..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ /dev/null @@ -1,120 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: eigen@codeplay.com -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
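The SYCL scan added above is a standard three-phase hierarchical scan: ScanKernelFunctor scans each block locally and writes the per-block total into a temporary buffer, ScanLauncher_impl::scan_block recursively scans those block totals while block_size > 1, and ScanAdjustmentKernelFunctor folds each scanned total back onto its block. A minimal serial sketch of the same control flow (illustrative names only, with none of the SYCL, panel, or stride machinery) might look like:

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Hedged sketch only: a serial analogue of the three-phase block scan
// launched above; all names here are illustrative, not Eigen API.
std::vector<int> blocked_inclusive_scan(const std::vector<int>& in, std::size_t block_len) {
  std::vector<int> out(in.size());
  if (in.empty()) return out;
  const std::size_t num_blocks = (in.size() + block_len - 1) / block_len;
  std::vector<int> block_totals(num_blocks);

  // Phase 1 (ScanKernelFunctor): scan each block locally, record its total.
  for (std::size_t b = 0; b < num_blocks; ++b) {
    const std::size_t lo = b * block_len;
    const std::size_t hi = std::min(in.size(), lo + block_len);
    std::partial_sum(in.begin() + lo, in.begin() + hi, out.begin() + lo);
    block_totals[b] = out[hi - 1];
  }

  // Phase 2 (recursive scan_block on the temp buffer): exclusive scan of the totals.
  std::vector<int> offsets(num_blocks, 0);
  std::partial_sum(block_totals.begin(), block_totals.end() - 1, offsets.begin() + 1);

  // Phase 3 (ScanAdjustmentKernelFunctor): add each block's offset back in.
  for (std::size_t b = 1; b < num_blocks; ++b) {
    const std::size_t lo = b * block_len;
    const std::size_t hi = std::min(in.size(), lo + block_len);
    for (std::size_t i = lo; i < hi; ++i) out[i] += offsets[b];
  }
  return out;
}

In the kernels above, phases 1 and 3 run as parallel nd_range launches, and phase 2 recurses through scan_step::second until a single block remains.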
- -// General include header of SYCL target for Tensor Module -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H - -#ifdef EIGEN_USE_SYCL - -// global pointer to set different attribute state for a class -template <class T> -struct MakeGlobalPointer { - typedef typename cl::sycl::global_ptr<T>::pointer_t Type; - typedef typename cl::sycl::global_ptr<T>::reference_t RefType; -}; - -// global pointer to set different attribute state for a class -template <class T> -struct MakeLocalPointer { - typedef typename cl::sycl::local_ptr<T>::pointer_t Type; - typedef typename cl::sycl::local_ptr<T>::reference_t RefType; -}; - - -namespace Eigen { - template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp; - template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>; -namespace internal { - -#ifdef __SYCL_DEVICE_ONLY__ -template<typename A, typename B> struct TypeConversion { - template<typename T> - static typename MakeGlobalPointer<A>::Type get_address_space_pointer(typename MakeGlobalPointer<T>::Type p); - template<typename T> - static typename MakeLocalPointer<A>::Type get_address_space_pointer(typename MakeLocalPointer<T>::Type p); - - template<typename T> - static A* get_address_space_pointer(T* p); - typedef decltype(get_address_space_pointer(B())) type; -}; - -#endif -} -namespace TensorSycl { -namespace internal { - - template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; -/// This struct is used for special expression nodes with no operations (for example assign and selectOP). - struct NoOP; - -template<bool IsConst, typename T> struct GetType{ - typedef const T Type; -}; -template<typename T> struct GetType<false, T>{ - typedef T Type; -}; - -template <bool Conds, size_t X , size_t Y > struct ValueCondition { - static constexpr size_t Res =X; -}; -template<size_t X, size_t Y> struct ValueCondition<false, X, Y> { - static constexpr size_t Res =Y; -}; - -} -} -} - -// tuple construction -#include "TensorSyclTuple.h" - -// counting number of leaf at compile time -#include "TensorSyclLeafCount.h" - -// The index PlaceHolder takes the actual expression and replaces the actual -// data on it with the place holder. 
It uses the same pre-order expression tree -// traverse as the leaf count in order to give the right access number to each -// node in the expression -#include "TensorSyclPlaceHolderExpr.h" - -// creation of an accessor tuple from a tuple of SYCL buffers -#include "TensorSyclExtractAccessor.h" - -// this is used to change the address space type in tensor map for GPU -#include "TensorSyclConvertToDeviceExpression.h" - -// this is used to extract the functors -#include "TensorSyclExtractFunctors.h" - -// this is used to create tensormap on the device -// this is used to construct the expression on the device -#include "TensorSyclExprConstructor.h" - -/// this is used for extracting tensor reduction -#include "TensorReductionSycl.h" - -// TensorArgMaxSycl.h -#include "TensorArgMaxSycl.h" - -/// this is used for extracting tensor convolution -#include "TensorConvolutionSycl.h" - -// kernel execution using fusion -#include "TensorSyclRun.h" -//sycl functors -#include "TensorSyclFunctors.h" - -#include "TensorContractionSycl.h" - -#endif // end of EIGEN_USE_SYCL -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h deleted file mode 100644 index d6ac7b91f..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ /dev/null @@ -1,205 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * Conversion from host pointer to device pointer - * inside leaf nodes of the expression. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// \struct ConvertToDeviceExpression -/// \brief This struct is used to convert the MakePointer in the host expression -/// to the MakeGlobalPointer for the device expression. For the leafNodes -/// containing the pointer. This is due to the fact that the address space of -/// the pointer T* is different on the host and the device. -template <typename Expr> -struct ConvertToDeviceExpression; - -template<template<class...> class NonOpCategory, bool IsConst, typename... 
Args> -struct NonOpConversion{ - typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type; -}; - - -template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args> -struct DeviceConvertor{ - typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type; -}; - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorMap -#define TENSORMAPCONVERT(CVQual)\ -template <typename T, int Options_, template <class> class MakePointer_>\ -struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\ - typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ -}; - -TENSORMAPCONVERT(const) -TENSORMAPCONVERT() -#undef TENSORMAPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp -#define CATEGORYCONVERT(CVQual)\ -template <template<class, class...> class Category, typename OP, typename... subExprs>\ -struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\ - typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\ -}; -CATEGORYCONVERT(const) -CATEGORYCONVERT() -#undef CATEGORYCONVERT - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseSelectOp -#define SELECTOPCONVERT(CVQual, Res)\ -template <typename IfExpr, typename ThenExpr, typename ElseExpr>\ -struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\ -: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {}; -SELECTOPCONVERT(const, true) -SELECTOPCONVERT(, false) -#undef SELECTOPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is const AssingOP -#define ASSIGNCONVERT(CVQual, Res)\ -template <typename LHSExpr, typename RHSExpr>\ -struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\ -: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{}; - -ASSIGNCONVERT(const, true) -ASSIGNCONVERT(, false) -#undef ASSIGNCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorEvalToOp -#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ -template <typename Expr>\ -struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ -: DeviceConvertor<ExprNode, Res, Expr>{}; - - -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp -#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ -template <typename Expr>\ -struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\ - typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\ -}; - - -// TensorForcedEvalOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) - -// TensorLayoutSwapOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) - -//TensorIndexTupleOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) 
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) -#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTREDUCTION(CVQual)\ -template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\ -struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\ - typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\ -}; - -KERNELBROKERCONVERTREDUCTION(const) -KERNELBROKERCONVERTREDUCTION() -#undef KERNELBROKERCONVERTREDUCTION - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ -template <typename OP, typename Dim, typename subExpr>\ -struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\ - typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\ -}; - -KERNELBROKERCONVERTTUPLEREDUCTION(const) -KERNELBROKERCONVERTTUPLEREDUCTION() -#undef KERNELBROKERCONVERTTUPLEREDUCTION - -//TensorSlicingOp -#define KERNELBROKERCONVERTSLICEOP(CVQual)\ -template<typename StartIndices, typename Sizes, typename XprType>\ -struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; - -KERNELBROKERCONVERTSLICEOP(const) -KERNELBROKERCONVERTSLICEOP() -#undef KERNELBROKERCONVERTSLICEOP - -//TensorStridingSlicingOp -#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ -struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; - -KERNELBROKERCONVERTERSLICESTRIDEOP(const) -KERNELBROKERCONVERTERSLICESTRIDEOP() -#undef KERNELBROKERCONVERTERSLICESTRIDEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp -#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ -template <DenseIndex DimId, typename Expr>\ -struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\ - typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\ -}; -KERNELBROKERCONVERTCHIPPINGOP(const) -KERNELBROKERCONVERTCHIPPINGOP() -#undef KERNELBROKERCONVERTCHIPPINGOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp -#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ -template<DenseIndex Rows, DenseIndex Cols, typename XprType>\ -struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\ - typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; -KERNELBROKERCONVERTIMAGEPATCHOP(const) -KERNELBROKERCONVERTIMAGEPATCHOP() -#undef KERNELBROKERCONVERTIMAGEPATCHOP - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp -#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ -template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ -struct ConvertToDeviceExpression<CVQual 
TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\ - typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; -KERNELBROKERCONVERTVOLUMEPATCHOP(const) -KERNELBROKERCONVERTVOLUMEPATCHOP() -#undef KERNELBROKERCONVERTVOLUMEPATCHOP - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h deleted file mode 100644 index 67003daf5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ /dev/null @@ -1,514 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclExprConstructor.h - * - * \brief: - * This file re-create an expression on the SYCL device in order - * to use the original tensor evaluator. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -template <typename Expr, typename Dims> -struct DeviceFixedSizeTensor; - -template <typename Expr, typename std::ptrdiff_t... Indices> -struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{ - template<typename Data> - static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);} -}; -/// this class is used by EvalToOp in order to create an lhs expression which is -/// a pointer from an accessor on device-only buffer -template <typename PtrType, size_t N, typename... Params> -struct EvalToLHSConstructor { - PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {} -}; - -/// struct ExprConstructor is used to reconstruct the expression on the device and -/// recreate the expression with MakeGlobalPointer containing the device address -/// space for the TensorMap pointers used in eval function. -/// It receives the original expression type, the functor of the node, the tuple -/// of accessors, and the device expression type to re-instantiate the -/// expression tree for the device -template <typename OrigExpr, typename IndexExpr, typename... Params> -struct ExprConstructor; - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAP(CVQual)\ -template <typename T, int Options_,\ -template <class> class MakePointer_, size_t N, typename... 
Params>\ -struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\ -}; - -TENSORMAP(const) -TENSORMAP() -#undef TENSORMAP - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAPFIXEDSIZE(CVQual)\ -template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\ -template <class> class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\ - : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\ -}; - -TENSORMAPFIXEDSIZE(const) -TENSORMAPFIXEDSIZE() -#undef TENSORMAPFIXEDSIZE - -#define UNARYCATEGORY(CVQual)\ -template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ -struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\ - typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\ - my_type rhsExpr;\ - typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\ -}; - -UNARYCATEGORY(const) -UNARYCATEGORY() -#undef UNARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorBinaryOp -#define BINARYCATEGORY(CVQual)\ -template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\ -typename RHSExpr, typename... 
Params>\ -struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>, CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\ - typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\ - typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\ - typedef CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\ - my_left_type lhsExpr;\ - my_right_type rhsExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\ -}; - -BINARYCATEGORY(const) -BINARYCATEGORY() -#undef BINARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorCwiseTernaryOp -#define TERNARYCATEGORY(CVQual)\ -template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\ -typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\ -struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\ - typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\ - typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\ - typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\ - typedef CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\ - my_arg1_type arg1Expr;\ - my_arg2_type arg2Expr;\ - my_arg3_type arg3Expr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\ - : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\ -}; - -TERNARYCATEGORY(const) -TERNARYCATEGORY() -#undef TERNARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorCwiseSelectOp -#define SELECTOP(CVQual)\ -template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\ -struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\ - typedef ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\ - typedef ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\ - typedef ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\ - typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\ - my_if_type ifExpr;\ - my_then_type thenExpr;\ - my_else_type elseExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\ -}; - -SELECTOP(const) -SELECTOP() -#undef SELECTOP - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// const TensorAssignOp -#define ASSIGN(CVQual)\ -template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... 
Params>\ -struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\ - typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\ - typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\ - typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type> Type;\ - my_left_type lhsExpr;\ - my_right_type rhsExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\ - }; - - ASSIGN(const) - ASSIGN() - #undef ASSIGN - - /// specialisation of the \ref ExprConstructor struct when the node type is - /// const TensorAssignOp - #define CONVERSIONEXPRCONST(CVQual)\ - template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\ - struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\ - typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\ - typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\ - my_nested_type nestedExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ - }; - - CONVERSIONEXPRCONST(const) - CONVERSIONEXPRCONST() - #undef CONVERSIONEXPRCONST - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorEvalToOp /// 0 here is the output number in the buffer -#define EVALTO(CVQual)\ -template <typename OrigExpr, typename Expr, typename... Params>\ -struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\ - typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\ - typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\ - typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\ - my_expr_type nestedExpression;\ - EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ -}; - -EVALTO(const) -EVALTO() -#undef EVALTO - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorForcedEvalOp -#define FORCEDEVAL(CVQual)\ -template <typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ -struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ -CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef TensorForcedEvalOp<OrigExpr> XprType;\ - typedef CVQual TensorMap<\ - Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ - Eigen::internal::traits<XprType>::Layout, \ - MakeGlobalPointer\ - > Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ -}; - -FORCEDEVAL(const) -FORCEDEVAL() -#undef FORCEDEVAL - -#define TENSORCUSTOMUNARYOP(CVQual)\ -template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ -struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\ -CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\ - typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\ - typedef CVQual TensorMap<\ - Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ - Eigen::internal::traits<XprType>::Layout, \ - MakeGlobalPointer\ - > Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ -}; - -TENSORCUSTOMUNARYOP(const) -TENSORCUSTOMUNARYOP() -#undef TENSORCUSTOMUNARYOP - -/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXPR(CVQual)\ -template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ -struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\ - static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ - typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\ - NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ -}; - -SYCLREDUCTIONEXPR(const) -SYCLREDUCTIONEXPR() -#undef SYCLREDUCTIONEXPR - -/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp -/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP. -#define SYCLTUPLEREDUCTIONEXPR(CVQual)\ -template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ -struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\ -CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\ - static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\ - static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\ -static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\ - typedef CVQual TensorMap<\ - Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\ - Layout,\ - MakeGlobalPointer\ - > XprType;\ - typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\ - static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\ - typedef array<Index, NumDims> StrideDims;\ - typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\ - fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\ - }\ -}; - -SYCLTUPLEREDUCTIONEXPR(const) -SYCLTUPLEREDUCTIONEXPR() -#undef SYCLTUPLEREDUCTIONEXPR - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp -#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\ -template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\ -struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ -CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ - typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\ - static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\ - typedef CVQual TensorMap<\ - Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\ - Eigen::internal::traits<XprTyp>::Layout, \ - MakeGlobalPointer\ - > Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ -}; - -//TensorContractionOp -SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) -SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) -//TensorConvolutionOp -SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) -SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) -//TensorCustomBinaryOp -SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) -SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) -#undef SYCLCONTRACTCONVCUSBIOPS - -//TensorSlicingOp -#define SYCLSLICEOPEXPR(CVQual)\ -template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... 
>{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\ -}; - -SYCLSLICEOPEXPR(const) -SYCLSLICEOPEXPR() -#undef SYCLSLICEOPEXPR - -//TensorStridingSlicingOp -#define SYCLSLICESTRIDEOPEXPR(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\ -}; - -SYCLSLICESTRIDEOPEXPR(const) -SYCLSLICESTRIDEOPEXPR() -#undef SYCLSLICESTRIDEOPEXPR - -//TensorReshapingOp and TensorShufflingOp -#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ -template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\ -}; - -// TensorReshapingOp -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, ) -// TensorShufflingOp -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, ) -#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST - -//TensorPaddingOp -#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\ -template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ -}; - -//TensorPaddingOp -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) -#undef SYCLPADDINGOPEXPRCONST - -// TensorChippingOp -#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ -template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... 
>{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\ -}; - -SYCLTENSORCHIPPINGOPEXPR(const) -SYCLTENSORCHIPPINGOPEXPR() -#undef SYCLTENSORCHIPPINGOPEXPR - -// TensorImagePatchOp -#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\ -template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ - funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \ - funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\ -}; - -SYCLTENSORIMAGEPATCHOPEXPR(const) -SYCLTENSORIMAGEPATCHOPEXPR() -#undef SYCLTENSORIMAGEPATCHOPEXPR - -// TensorVolumePatchOp -#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\ -template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... > {\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ - funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ - funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \ - funcD.m_padding_type, funcD.m_padding_value ){\ - }\ -}; - -SYCLTENSORVOLUMEPATCHOPEXPR(const) -SYCLTENSORVOLUMEPATCHOPEXPR() -#undef SYCLTENSORVOLUMEPATCHOPEXPR - -// TensorLayoutSwapOp and TensorIndexTupleOp -#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\ -template<typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... 
>{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ -}; - -//TensorLayoutSwapOp -SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp) -SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp) -//TensorIndexTupleOp -SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) -SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp) - -#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR - -/// template deduction for \ref ExprConstructor struct -template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params> -auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t) - -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) { - return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t); -} - -} /// namespace TensorSycl -} /// namespace internal -} /// namespace Eigen - - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h deleted file mode 100644 index 3c74d1696..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ /dev/null @@ -1,310 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclExtractAccessor.h - * - * \brief: - * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl - * buffers as an input. Using pre-order tree traversal, ExtractAccessor - * recursively calls itself for its children in the expression tree. The - * leaf node in the PlaceHolder expression is nothing but a container preserving - * the order of the actual data in the tuple of sycl buffer. By invoking the - * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth - * buffer in the tuple of buffers. This accessor is then added as an Nth - * element in the tuple of accessors. In this case we preserve the order of data - * in the expression tree. - * - * This is the specialisation of extract accessor method for different operation - * type in the PlaceHolder expression. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { -#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} - -/// struct ExtractAccessor: Extract Accessor Class is used to extract the -/// accessor from a buffer. 
-/// Depending on the type of the leaf node we can get a read accessor or a -/// read_write accessor -template <typename Evaluator> -struct ExtractAccessor; - -struct AccessorConstructor{ - template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval)) - - template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) - - template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) - - template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))) -}; - -/// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp -#define SYCLUNARYCATEGORYEXTACC(CVQual)\ -template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\ -RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ -}; - -SYCLUNARYCATEGORYEXTACC(const) -SYCLUNARYCATEGORYEXTACC() -#undef SYCLUNARYCATEGORYEXTACC - - -/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp -#define SYCLBINARYCATEGORYEXTACC(CVQual)\ -template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ -}; - -SYCLBINARYCATEGORYEXTACC(const) -SYCLBINARYCATEGORYEXTACC() -#undef SYCLBINARYCATEGORYEXTACC - -/// specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorCwiseTernaryOp -#define SYCLTERNARYCATEGORYEXTACC(CVQual)\ -template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\ -}; - -SYCLTERNARYCATEGORYEXTACC(const) -SYCLTERNARYCATEGORYEXTACC() -#undef SYCLTERNARYCATEGORYEXTACC - - -/// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorCwiseSelectOp. 
This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
-};
-
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
-SYCLTENSORASSIGNOPEXTACC(const)
-SYCLTENSORASSIGNOPEXTACC()
-#undef SYCLTENSORASSIGNOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
-#define TENSORMAPEXPR(CVQual, ACCType)\
-template <typename PlainObjectType, int Options_, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
-};
-
-TENSORMAPEXPR(const, cl::sycl::access::mode::read)
-TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
-//TensorCustomUnaryOp
-#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
-template <typename CustomUnaryFunc, typename XprType, typename Dev >\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCUSTOMUNARYOPEXTACC(const)
-SYCLCUSTOMUNARYOPEXTACC()
-#undef SYCLCUSTOMUNARYOPEXTACC
-
-//TensorCustomBinaryOp
-#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
-template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCUSTOMBINARYOPEXTACC(const)
-SYCLCUSTOMBINARYOPEXTACC()
-#undef SYCLCUSTOMBINARYOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
-};
-
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
-template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-// TensorReductionOp
-SYCLREDUCTIONEXTACC(const,TensorReductionOp)
-SYCLREDUCTIONEXTACC(,TensorReductionOp)
-
-// TensorTupleReducerOp
-SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
-SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-//TensorContractionOp
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-//TensorConvolutionOp
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorImagePatchOp.
-#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORIMAGEPATCHOPEXTACC(const)
-SYCLTENSORIMAGEPATCHOPEXTACC()
-#undef SYCLTENSORIMAGEPATCHOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorVolumePatchOp.
-#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORVOLUMEPATCHOPEXTACC(const)
-SYCLTENSORVOLUMEPATCHOPEXTACC()
-#undef SYCLTENSORVOLUMEPATCHOPEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorLayoutSwapOp, TensorIndexTupleOp
-#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
-template<typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-// TensorLayoutSwapOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
-//TensorIndexTupleOp
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
-SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
-
-#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
-
-/// template deduction for \ref ExtractAccessor
-template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
- return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
-}
-
-} /// namespace internal
-} /// namespace TensorSycl
-} /// namespace Eigen
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
deleted file mode 100644
index 09407e53e..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ /dev/null
@@ -1,467 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExtractFunctors.h
- *
- * \brief:
- * Used to extract all the functors allocated to each node of the expression
- * tree.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// struct FunctorExtractor: This struct is used to extract the functors
-/// constructed on the host side, to pack them and reuse them in the
-/// reconstruction of the expression on the device.
-/// We have to do this because Eigen's functors are not stateless, so we
-/// cannot re-instantiate them on the device; the instantiated functors must
-/// be passed to the device.
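To see why this extraction step is necessary, consider the following minimal sketch in plain C++ (illustrative only; ScaleBy and PackedFunctor are hypothetical names, not Eigen API). A functor can carry host-side state, so the device-side copy must be constructed from the live host instance rather than default-constructed, which is what FunctorExtractor arranges for every node of the expression tree.

// A minimal sketch of the pack-and-copy idea; ScaleBy and PackedFunctor
// are assumed, illustrative names.
struct ScaleBy {
  float factor;  // host-side state: a default-constructed copy would lose it
  float operator()(float x) const { return factor * x; }
};

template <typename Func>
struct PackedFunctor {
  Func func;  // stored by value so the state travels with the kernel
  explicit PackedFunctor(const Func& f) : func(f) {}
};

int main() {
  ScaleBy scale{2.5f};                   // instantiated on the host
  PackedFunctor<ScaleBy> packed(scale);  // packed, as FunctorExtractor would do
  return packed.func(2.0f) == 5.0f ? 0 : 1;
}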
-
-// This struct is used for leaf nodes (TensorMap) and nodes behaving like leaf nodes (TensorForcedEval).
-#define DEFAULTACTION(Evaluator)\
-typedef typename Evaluator::Dimensions Dimensions;\
-const Dimensions m_dimensions;\
-EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
-FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
-
-template <typename Evaluator> struct FunctorExtractor{
- DEFAULTACTION(Evaluator)
-};
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-///TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template <typename ArgType1, typename ArgType2, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
- FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
- : subExpr(expr.impl()) {}\
-};
-
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
-#undef SYCLEXTRFUNCCONVERSION
-
-#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
-struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
-FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
-};
-
-SYCLEXTRTENSORMAPFIXEDSIZE(const)
-SYCLEXTRTENSORMAPFIXEDSIZE()
-#undef SYCLEXTRTENSORMAPFIXEDSIZE
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-#define SYCLEXTRFUNCUNARY(CVQual)\
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()), func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCUNARY(const)
-SYCLEXTRFUNCUNARY()
-#undef SYCLEXTRFUNCUNARY
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseBinaryOp
-#define SYCLEXTRFUNCBINARY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCBINARY(const)
-SYCLEXTRFUNCBINARY()
-#undef SYCLEXTRFUNCBINARY
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
-#define SYCLEXTRFUNCTERNARY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
- FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
- FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
- : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
-};
-
-SYCLEXTRFUNCTERNARY(const)
-SYCLEXTRFUNCTERNARY()
-#undef SYCLEXTRFUNCTERNARY
-
-
-//TensorCustomOp must be specialised; otherwise it would be captured by UnaryCategory, although its behaviour differs
-//from UnaryCategory and is similar to the general FunctorExtractor.
-/// specialisation of TensorCustomOp
-#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
-template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
- typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
- DEFAULTACTION(Evaluator)\
-};
-//TensorCustomUnaryOp
-SYCLEXTRFUNCCUSTOMUNARYOP(const)
-SYCLEXTRFUNCCUSTOMUNARYOP()
-#undef SYCLEXTRFUNCCUSTOMUNARYOP
-
-//TensorCustomBinaryOp
-#define SYCLEXTRFUNCCUSTOMBINARYOP(CVQual)\
-template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
- typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
- DEFAULTACTION(Evaluator)\
-};
-//TensorCustomBinaryOp
-SYCLEXTRFUNCCUSTOMBINARYOP(const)
-SYCLEXTRFUNCCUSTOMBINARYOP()
-#undef SYCLEXTRFUNCCUSTOMBINARYOP
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCSELECTOP(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
- FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
- FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
- : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
-};
-
-SYCLEXTRFUNCSELECTOP(const)
-SYCLEXTRFUNCSELECTOP()
-#undef SYCLEXTRFUNCSELECTOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
-};
-SYCLEXTRFUNCASSIGNOP(const)
-SYCLEXTRFUNCASSIGNOP()
-#undef SYCLEXTRFUNCASSIGNOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node types are
-/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
-template <typename Expr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\
- : xprExpr(expr.impl()) {}\
-};
-//TensorEvalToOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
-// TensorLayoutSwapOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
-// TensorIndexTupleOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
-
-#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
-
-template<typename Dim, size_t NumOutputDim> struct DimConstr {
- template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims) {return dims;}
-};
-
-template<typename Dim> struct DimConstr<Dim, 0> {
- template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims) {return Dim(static_cast<Dim>(dims.TotalSize()));}
-};
-//TensorReductionOp
-#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
- typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
- typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
-};
-SYCLEXTRFUNCREDUCTIONOP(const)
-SYCLEXTRFUNCREDUCTIONOP()
-#undef SYCLEXTRFUNCREDUCTIONOP
-
-//TensorTupleReducerOp
-#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
-template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
- typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
- static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
- typedef typename Evaluator::StrideDims StrideDims;\
- typedef typename Evaluator::Index Index;\
- typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- const Index m_return_dim;\
- const StrideDims m_strides;\
- const Index m_stride_mod;\
- const Index m_stride_div;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\
- EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\
- EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\
- EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\
- FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
-   m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
-};
-
-SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
-SYCLEXTRFUNCTUPLEREDUCTIONOP()
-#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
-
-//TensorContractionOp and TensorConvolutionOp
-#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
- typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
- typedef typename Evaluator::Dimensions Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
- : m_dimensions(expr.dimensions()) {}\
-};
-
-//TensorContractionOp
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
-//TensorConvolutionOp
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
-#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorSlicingOp. This is a specialisation without an OP, so it has to be separated.
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_offsets;\
- const Sizes m_dimensions;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
- EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
-};
-
-SYCLEXTRFUNCTSLICEOP(const)
-SYCLEXTRFUNCTSLICEOP()
-#undef SYCLEXTRFUNCTSLICEOP
-
-//TensorStridingSlicingOp
-#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_startIndices;\
- const StopIndices m_stopIndices;\
- const Strides m_strides;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
- EIGEN_STRONG_INLINE const StopIndices& stopIndices() const { return m_stopIndices; }\
- EIGEN_STRONG_INLINE const Strides& strides() const { return m_strides; }\
-};
-
-SYCLEXTRFUNCTSLICESTRIDEOP(const)
-SYCLEXTRFUNCTSLICESTRIDEOP()
-#undef SYCLEXTRFUNCTSLICESTRIDEOP
-
-// Had to separate TensorReshapingOp and TensorShufflingOp; otherwise they would be mistaken for a UnaryCategory
-#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
-};
-
-//TensorReshapingOp
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
-
-//TensorShufflingOp
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
-
-// Had to separate TensorPaddingOp; otherwise it would be mistaken for a UnaryCategory
-#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
- const Scalar m_scalar_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
-};
-
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
-#undef PADDINGOPFUNCEXT
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp or TensorConcatenationOp;
-/// for TensorContractionOp the LHS and RHS here are the original expressions, so there is no need to apply a condition on their type.
-#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
-template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const Param func;\
- FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
-};
-
-// TensorConcatenationOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
-#undef SYCLEXTRFUNCCONTRACTCONCAT
-
-//TensorChippingOp
-#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
- const DenseIndex m_dim;\
- const DenseIndex m_offset;\
- EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
- EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
- : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
-};
-
-SYCLEXTRFUNCCHIPPINGOP(const)
-SYCLEXTRFUNCCHIPPINGOP()
-#undef SYCLEXTRFUNCCHIPPINGOP
-
-//TensorImagePatchOp
-#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
-typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\
-FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
-const DenseIndex m_patch_rows;\
-const DenseIndex m_patch_cols;\
-const DenseIndex m_row_strides;\
-const DenseIndex m_col_strides;\
-const DenseIndex m_in_row_strides;\
-const DenseIndex m_in_col_strides;\
-const DenseIndex m_row_inflate_strides;\
-const DenseIndex m_col_inflate_strides;\
-const bool m_padding_explicit;\
-const DenseIndex m_padding_top;\
-const DenseIndex m_padding_bottom;\
-const DenseIndex m_padding_left;\
-const DenseIndex m_padding_right;\
-const PaddingType m_padding_type;\
-const typename Self::Scalar m_padding_value;\
-FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
-: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
-  m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
-  m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
-  m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\
-  m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\
-  m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
-  m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\
-  m_padding_value(expr.xpr().padding_value()){}\
-};
-
-SYCLEXTRFUNCIMAGEPATCHOP(const)
-SYCLEXTRFUNCIMAGEPATCHOP()
-#undef SYCLEXTRFUNCIMAGEPATCHOP
-
-/// TensorVolumePatchOp
-#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\
-typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> Self;\
-FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
-const DenseIndex m_patch_planes;\
-const DenseIndex m_patch_rows;\
-const DenseIndex m_patch_cols;\
-const DenseIndex m_plane_strides;\
-const DenseIndex m_row_strides;\
-const DenseIndex m_col_strides;\
-const DenseIndex m_in_plane_strides;\
-const DenseIndex m_in_row_strides;\
-const DenseIndex m_in_col_strides;\
-const DenseIndex m_plane_inflate_strides;\
-const DenseIndex m_row_inflate_strides;\
-const DenseIndex m_col_inflate_strides;\
-const bool m_padding_explicit;\
-const DenseIndex m_padding_top_z;\
-const DenseIndex m_padding_bottom_z;\
-const DenseIndex m_padding_top;\
-const DenseIndex m_padding_bottom;\
-const DenseIndex m_padding_left;\
-const DenseIndex m_padding_right;\
-const PaddingType m_padding_type;\
-const typename Self::Scalar m_padding_value;\
-FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
-: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
-  m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
-  m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
-  m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\
-  m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\
-  m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()),\
-  m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
-  m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\
-};
-SYCLEXTRFUNCVOLUMEPATCHOP(const)
-SYCLEXTRFUNCVOLUMEPATCHOP()
-#undef SYCLEXTRFUNCVOLUMEPATCHOP
-
-
-/// template deduction function for FunctorExtractor
-template <typename Evaluator>
-inline auto extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
- return FunctorExtractor<Evaluator>(evaluator);
-}
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
deleted file mode 100644
index a447c3f88..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ /dev/null
@@ -1,248 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General kernel functors for the SYCL backend of the Tensor module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
- OP op;
- OutputAccessor aOut;
- ptrdiff_t out_offset;
- InputAccessor aI;
- LocalAccessor scratch;
- size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- size_t globalid = itemID.get_global(0);
- size_t localid = itemID.get_local(0);
- /* All threads collectively read from global memory into local.
-  * The barrier ensures all threads' IO is resolved before
-  * execution continues (strictly speaking, all threads within
-  * a single work-group - there is no co-ordination between
-  * work-groups, only work-items). */
- if (globalid < length) {
-   scratch[localid] = aI[globalid];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
-  * id and the one on the other half of the vector. */
- if (globalid < length) {
-   auto min = (length < local) ? length : local;
-   for (size_t offset = min / 2; offset > 0; offset /= 2) {
-     if (localid < offset) {
-       auto accum = op.initialize();
-       op.reduce(scratch[localid], &accum);
-       op.reduce(scratch[localid + offset], &accum);
-       op.finalize(accum);
-       scratch[localid]=accum;
-     }
-     itemID.barrier(cl::sycl::access::fence_space::local_space);
-   }
-   /* The final result will be stored in local id 0. */
-   if (localid == 0) {
-     aI[itemID.get_group(0)] = scratch[localid];
-     if((length<=local) && globalid ==0){
-       auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
-       aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
-     }
-   }
- }
- }
-
- };
-
-/// ReductionFunctor
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// Reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
- /// the first is when it is used as the root to launch a sub-kernel, and the second is when it is treated as a leaf node that passes its
- /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
- /// the device evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const_cast added as a naive solution to solve the qualifier-drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
-   typename DeviceSelf::CoeffReturnType accum = functor.initialize();
-   Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
-   functor.finalize(accum);
-   output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
- }
- }
- private:
- write_accessor output_accessor;
- ptrdiff_t out_offset;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
-};
-
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
-class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
- typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
-  Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// Reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
- /// the first is when it is used as the root to launch a sub-kernel, and the second is when it is treated as a leaf node that passes its
- /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
- /// the device evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const_cast added as a naive solution to solve the qualifier-drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
-   typename DeviceSelf::CoeffReturnType accum = functor.initialize();
-   Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
-   functor.finalize(accum);
-   output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
- }
- }
- private:
- write_accessor output_accessor;
- ptrdiff_t out_offset;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// Reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
- /// the first is when it is used as the root to launch a sub-kernel, and the second is when it is treated as a leaf node that passes its
- /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
- /// the device evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- /// const_cast added as a naive solution to solve the qualifier-drop error
- auto globalid=itemID.get_global_linear_id();
-
- tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
- : static_cast<CoeffReturnType>(op.initialize());
-
- if(remaining!=0 && globalid==0 ){
-   // this adds the rest of the input buffer when the input size is not divisible by red_factor.
-   auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
-   auto accum = op.initialize();
-   op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
-   op.reduce(remaining_reduce, &accum);
-   op.finalize(accum);
-   tmp_global_accessor.get_pointer()[0]=accum;
-
- }
- }
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// Reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
- /// the first is when it is used as the root to launch a sub-kernel, and the second is when it is treated as a leaf node that passes its
- /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. It is identical to the self expression passed to the run function, except that
- /// the device evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
- /// const_cast added as a naive solution to solve the qualifier-drop error
- auto globalid=itemID.get_global_linear_id();
- auto scale = (rng*red_factor) + remaining;
-
- tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
- :static_cast<CoeffReturnType>(op.initialize())/scale;
-
- if(remaining!=0 && globalid==0 ){
-   // this adds the rest of the input buffer when the input size is not divisible by red_factor.
-   auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
-   auto accum = op.initialize();
-   tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
-   op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
-   op.reduce(remaining_reduce, &accum);
-   op.finalize(accum);
-   tmp_global_accessor.get_pointer()[0]=accum/scale;
-
- }
- }
-};
-
-} // namespace internal
-} // namespace TensorSycl
-} // namespace Eigen
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
deleted file mode 100644
index 234580c7c..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ /dev/null
@@ -1,213 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclLeafCount.h
- *
- * \brief:
- * The leaf count uses a pre-order traversal of the expression tree to
- * count the number of leaf nodes in the expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \brief LeafCount is used to count terminal nodes. The total number of
-/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
-/// of a leaf node in an expression tree at compile time.
-template <typename Expr>
-struct LeafCount;
-
-template<typename... Args> struct CategoryCount;
-
-template<> struct CategoryCount<>
-{
- static const size_t Count =0;
-};
-
-template<typename Arg, typename... Args>
-struct CategoryCount<Arg,Args...>{
- static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
-struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
- static const size_t Count =1;\
-};
-
-SYCLTENSORMAPLEAFCOUNT(const)
-SYCLTENSORMAPLEAFCOUNT()
-#undef SYCLTENSORMAPLEAFCOUNT
-
-// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
-#define SYCLCATEGORYLEAFCOUNT(CVQual)\
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
-struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-
-SYCLCATEGORYLEAFCOUNT(const)
-SYCLCATEGORYLEAFCOUNT()
-#undef SYCLCATEGORYLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp; this is an exception
-#define SYCLSELECTOPLEAFCOUNT(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-
-SYCLSELECTOPLEAFCOUNT(const)
-SYCLSELECTOPLEAFCOUNT()
-#undef SYCLSELECTOPLEAFCOUNT
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
-#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr>\
-struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-
-SYCLLEAFCOUNTASSIGNOP(const)
-SYCLLEAFCOUNTASSIGNOP()
-#undef SYCLLEAFCOUNTASSIGNOP
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
- static const size_t Count =1;\
-};
-
-SYCLFORCEDEVALLEAFCOUNT(const)
-SYCLFORCEDEVALLEAFCOUNT()
-#undef SYCLFORCEDEVALLEAFCOUNT
-
-#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
-template <typename CustomUnaryFunc, typename XprType>\
-struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
-static const size_t Count =1;\
-};
-
-SYCLCUSTOMUNARYOPLEAFCOUNT(const)
-SYCLCUSTOMUNARYOPLEAFCOUNT()
-#undef SYCLCUSTOMUNARYOPLEAFCOUNT
-
-
-#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
-template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
-static const size_t Count =1;\
-};
-SYCLCUSTOMBINARYOPLEAFCOUNT(const)
-SYCLCUSTOMBINARYOPLEAFCOUNT()
-#undef SYCLCUSTOMBINARYOPLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp, TensorLayoutSwapOp, or TensorIndexTupleOp
-#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual, ExprNode, Num)\
-template <typename Expr>\
-struct LeafCount<CVQual ExprNode<Expr> > {\
- static const size_t Count = Num + CategoryCount<Expr>::Count;\
-};
-
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
-
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
-EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
-
-#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
- static const size_t Count =1;\
-};
-
-// TensorReductionOp
-REDUCTIONLEAFCOUNT(const,TensorReductionOp)
-REDUCTIONLEAFCOUNT(,TensorReductionOp)
-
-// tensor Argmax -TensorTupleReducerOp
-REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
-REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
-
-#undef REDUCTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp or TensorConvolutionOp
-#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
- static const size_t Count =1;\
-};
-
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
-#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
-#define SLICEOPLEAFCOUNT(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType>\
-struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
-
-SLICEOPLEAFCOUNT(const)
-SLICEOPLEAFCOUNT()
-#undef SLICEOPLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
-#define CHIPPINGOPLEAFCOUNT(CVQual)\
-template <DenseIndex DimId, typename XprType>\
-struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
-
-CHIPPINGOPLEAFCOUNT(const)
-CHIPPINGOPLEAFCOUNT()
-#undef CHIPPINGOPLEAFCOUNT
-
-///TensorStridingSlicingOp
-#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
-
-SLICESTRIDEOPLEAFCOUNT(const)
-SLICESTRIDEOPLEAFCOUNT()
-#undef SLICESTRIDEOPLEAFCOUNT
-
-//TensorImagePatchOp
-#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{};
-
-TENSORIMAGEPATCHOPLEAFCOUNT(const)
-TENSORIMAGEPATCHOPLEAFCOUNT()
-#undef TENSORIMAGEPATCHOPLEAFCOUNT
-
-// TensorVolumePatchOp
-#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
-struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
-
-TENSORVOLUMEPATCHOPLEAFCOUNT(const)
-TENSORVOLUMEPATCHOPLEAFCOUNT()
-#undef TENSORVOLUMEPATCHOPLEAFCOUNT
-
-} /// namespace internal
-} /// namespace TensorSycl
-} /// namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
deleted file mode 100644
index 9d5708fc5..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ /dev/null
@@ -1,302 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclPlaceHolderExpr.h - * - * \brief: - * This is the specialisation of the placeholder expression based on the - * operation type - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// \struct PlaceHolder -/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression -/// tree. -/// PlaceHolder contains the order of the leaf node in the expression tree. -template <typename Scalar, size_t N> -struct PlaceHolder { - static constexpr size_t I = N; - typedef Scalar Type; -}; - -/// \struct PlaceHolderExpression -/// \brief It is used to create the PlaceHolder expression. The PlaceHolder -/// expression is a copy of the expression type in which the TensorMap nodes -/// have been replaced with PlaceHolder. -template <typename Expr, size_t N> -struct PlaceHolderExpression; - -template<size_t N, typename... Args> -struct CalculateIndex; - -template<size_t N, typename Arg> -struct CalculateIndex<N, Arg>{ - typedef typename PlaceHolderExpression<Arg, N>::Type ArgType; - typedef utility::tuple::Tuple<ArgType> ArgsTuple; -}; - -template<size_t N, typename Arg1, typename Arg2> -struct CalculateIndex<N, Arg1, Arg2>{ - static const size_t Arg2LeafCount = LeafCount<Arg2>::Count; - typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type; - typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type; - typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple; -}; - -template<size_t N, typename Arg1, typename Arg2, typename Arg3> -struct CalculateIndex<N, Arg1, Arg2, Arg3> { - static const size_t Arg3LeafCount = LeafCount<Arg3>::Count; - static const size_t Arg2LeafCount = LeafCount<Arg2>::Count; - typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type; - typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type; - typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type; - typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple; -}; - -template<template<class...> class Category , class OP, class TPL> -struct CategoryHelper; - -template<template<class...> class Category , class OP, class ...T > -struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > { - typedef Category<OP, T... > Type; -}; - -template<template<class...> class Category , class ...T > -struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > { - typedef Category<T... > Type; -}; - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp -#define OPEXPRCATEGORY(CVQual)\ -template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\ -struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\ - typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\ -}; - -OPEXPRCATEGORY(const) -OPEXPRCATEGORY() -#undef OPEXPRCATEGORY - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorSelectOp -#define SELECTEXPR(CVQual)\ -template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\ - typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\ -}; - -SELECTEXPR(const) -SELECTEXPR() -#undef SELECTEXPR - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorAssignOp -#define ASSIGNEXPR(CVQual)\ -template <typename LHSExpr, typename RHSExpr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\ - typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\ -}; - -ASSIGNEXPR(const) -ASSIGNEXPR() -#undef ASSIGNEXPR - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorMap -#define TENSORMAPEXPR(CVQual)\ -template <typename T, int Options_, template <class> class MakePointer_, size_t N>\ -struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\ -}; - -TENSORMAPEXPR(const) -TENSORMAPEXPR() -#undef TENSORMAPEXPR - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorForcedEvalOp -#define FORCEDEVAL(CVQual)\ -template <typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\ -}; - -FORCEDEVAL(const) -FORCEDEVAL() -#undef FORCEDEVAL - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorCustomUnaryOp -#define CUSTOMUNARYOPEVAL(CVQual)\ -template <typename CustomUnaryFunc, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\ -}; - -CUSTOMUNARYOPEVAL(const) -CUSTOMUNARYOPEVAL() -#undef CUSTOMUNARYOPEVAL - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorCustomBinaryOp -#define CUSTOMBINARYOPEVAL(CVQual)\ -template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\ -}; - -CUSTOMBINARYOPEVAL(const) -CUSTOMBINARYOPEVAL() -#undef CUSTOMBINARYOPEVAL - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp -#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\ -template <typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\ - typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\ -}; - -// TensorEvalToOp -EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp) -EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp) -//TensorLayoutSwapOp -EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp) -EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp) -//TensorIndexTupleOp -EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp) -EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp) - -#undef EVALTOLAYOUTSWAPINDEXTUPLE - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorChippingOp -#define CHIPPINGOP(CVQual)\ -template <DenseIndex DimId, typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\ - typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\ -}; - -CHIPPINGOP(const) -CHIPPINGOP() -#undef CHIPPINGOP - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp and TensorTupleReducerOp (Argmax) -#define SYCLREDUCTION(CVQual, ExprNode)\ -template <typename OP, typename Dims, typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\ - typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\ -}; - -// tensor reduction -SYCLREDUCTION(const, TensorReductionOp) -SYCLREDUCTION(, TensorReductionOp) - -// tensor Argmax -TensorTupleReducerOp -SYCLREDUCTION(const, TensorTupleReducerOp) -SYCLREDUCTION(, TensorTupleReducerOp) -#undef SYCLREDUCTION - - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorContractionOp and TensorConvolutionOp -#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ -template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\ -struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\ - typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\ -}; -SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTIONPLH - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorSlicingOp -#define SLICEOPEXPR(CVQual)\ -template <typename StartIndices, typename Sizes, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\ -}; - -SLICEOPEXPR(const) -SLICEOPEXPR() -#undef SLICEOPEXPR - - -#define SYCLSLICESTRIDEOPPLH(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\ -}; - -SYCLSLICESTRIDEOPPLH(const) -SYCLSLICESTRIDEOPPLH() -#undef SYCLSLICESTRIDEOPPLH - - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorImagePatchOp -#define SYCLTENSORIMAGEPATCHOP(CVQual)\ -template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\ - typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ -}; - -SYCLTENSORIMAGEPATCHOP(const) -SYCLTENSORIMAGEPATCHOP() -#undef SYCLTENSORIMAGEPATCHOP - - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorVolumePatchOp -#define SYCLTENSORVOLUMEPATCHOP(CVQual)\ -template<DenseIndex
Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\ - typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ -}; - -SYCLTENSORVOLUMEPATCHOP(const) -SYCLTENSORVOLUMEPATCHOP() -#undef SYCLTENSORVOLUMEPATCHOP - - -/// template deduction for \ref PlaceHolderExpression struct -template <typename Expr> -struct createPlaceHolderExpression { - static const size_t TotalLeaves = LeafCount<Expr>::Count; - typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type; -}; - -} // internal -} // TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h deleted file mode 100644 index 29c78184d..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ /dev/null @@ -1,96 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Cummins Chris PhD student at The University of Edinburgh. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclRun.h - * - * \brief: - * Schedule_kernel invokes a specialised version of the kernel struct. The - * specialisation is based on the data dimension in the SYCL buffer - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP - -namespace Eigen { -namespace TensorSycl { -template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ - typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; - - typedef typename Expr::Index Index; - FunctorExpr functors; - TupleType tuple_of_accessors; - Index range; - ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_) - : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){} - void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; - auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); - typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); - if (gId < range) - device_evaluator.evalScalar(gId); - } -}; - -/// The run function in tensor sycl converts the expression tree to a buffer -/// based expression tree; -/// creates the expression tree for the device with accessors to buffers; -/// constructs the kernel and submits it to the sycl queue. -/// std::array does not have TotalSize. So I have to get the size through template specialisation.
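For orientation, the flow described in the comment above can be pictured from the caller's side. A minimal sketch, written as C++ comments so the surrounding deleted file still reads as code (the device setup mirrors the CwiseMul example later in this patch; the tensor maps a and b are hypothetical and not part of the original file):

// Hypothetical usage of the old TensorSycl::run() path (sketch only):
//
//   auto device = Eigen::get_sycl_supported_devices()[0];
//   Eigen::QueueInterface queue(device);
//   Eigen::SyclDevice dev(&queue);
//   // a and b are float TensorMaps of equal size over device allocations
//   a.device(dev) = b * b;  // builds a TensorAssignOp expression tree; the executor
//                           // then handed it to run(expr, dev) below, which extracted
//                           // the functors and accessors and submitted the kernel above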
-template<typename , typename Dimensions> struct DimensionSize{ - static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){ - return dim.TotalSize(); - } -}; -#define DIMSIZEMACRO(CVQual)\ -template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\ - static inline Index getDimSize(const std::array<Index, NumDims>& dim){\ - return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\ - }\ -}; - -DIMSIZEMACRO(const) -DIMSIZEMACRO() -#undef DIMSIZEMACRO - - -template <typename Expr, typename Dev> -void run(Expr &expr, Dev &dev) { - Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr; - FunctorExpr functors = internal::extractFunctors(evaluator); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // create a tuple of accessors from Evaluator - typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType; - TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator); - typename Expr::Index range, GRange, tileSize; - typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions())); - dev.parallel_for_setup(total_size, tileSize, range, GRange); - - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range - , functors, tuple_of_accessors - )); - }); - dev.asynchronousExec(); - } - evaluator.cleanup(); -} -} // namespace TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h deleted file mode 100644 index 5385c7eac..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h +++ /dev/null @@ -1,239 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclTuple.h - * - * \brief: - * Minimal implementation of std::tuple that can be used inside a SYCL kernel. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP - -namespace utility { -namespace tuple { -/// \struct StaticIf -/// \brief The StaticIf struct is used to statically choose the type based on the -/// condition. -template <bool, typename T = void> struct StaticIf; -/// \brief specialisation of the \ref StaticIf when the condition is true -template <typename T> -struct StaticIf<true, T> { - typedef T type; -}; - -/// \struct Tuple -/// \brief is a fixed-size collection of heterogeneous values -/// \tparam Ts... - the types of the elements that the tuple stores. -/// Empty list is supported. -template <class... Ts> -struct Tuple {}; - -/// \brief specialisation of the \ref Tuple class when the tuple has at least -/// one element. -/// \tparam T : the type of the first element in the tuple. -/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty. -template <class T, class... Ts> -struct Tuple<T, Ts...> { - Tuple(T t, Ts... ts) : head(t), tail(ts...) {} - T head; - Tuple<Ts...> tail; -}; - -/// \struct ElemTypeHolder -/// \brief ElemTypeHolder class is used to specify the types of the -/// elements inside the tuple -/// \tparam size_t the index of the queried element inside the tuple -/// \tparam class the tuple class -template <size_t, class> -struct ElemTypeHolder; - -/// \brief specialisation of the \ref ElemTypeHolder class when the requested -/// element index is 0 -template <class T, class... Ts> -struct ElemTypeHolder<0, Tuple<T, Ts...> > { - typedef T type; -}; - -/// \brief specialisation of the \ref ElemTypeHolder class when the requested -/// element index is bigger than 0. It recursively calls itself to -/// detect the type of each element in the tuple -/// \tparam T : the type of the first element in the tuple. -/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty. -/// \tparam k is the index of the requested element in the tuple -template <size_t k, class T, class... Ts> -struct ElemTypeHolder<k, Tuple<T, Ts...> > { - typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type; -}; - -/// get -/// \brief Extracts the first element from the tuple. -/// K=0 represents the first element of the tuple. The tuple cannot be empty. -/// \tparam Ts... are the type of the elements in the tuple. -/// \param t is the tuple whose contents to extract -/// \return typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type - -#define TERMINATE_CONDS_TUPLE_GET(CVQual) \ -template <size_t k, class... Ts> \ -typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \ -get(CVQual Tuple<Ts...> &t) { \ - static_assert(sizeof...(Ts)!=0, "The requested value is bigger than the size of the tuple"); \ - return t.head; \ -} - -TERMINATE_CONDS_TUPLE_GET(const) -TERMINATE_CONDS_TUPLE_GET() -#undef TERMINATE_CONDS_TUPLE_GET -/// get -/// \brief Extracts the Kth element from the tuple. -///\tparam K is an integer value in [0,sizeof...(Types)). -/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple -/// \tparam Ts... are the type of the elements in the tuple. -/// \param t is the tuple whose contents to extract -/// \return typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type -#define RECURSIVE_TUPLE_GET(CVQual) \ -template <size_t k, class T, class... Ts> \ -typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \ -get(CVQual Tuple<T, Ts...> &t) { \ - return utility::tuple::get<k - 1>(t.tail); \ -} -RECURSIVE_TUPLE_GET(const) -RECURSIVE_TUPLE_GET() -#undef RECURSIVE_TUPLE_GET - -/// make_tuple -/// \brief Creates a tuple object, deducing the target type from the types of -/// arguments. -/// \tparam Args the type of the arguments to construct the tuple from -/// \param args zero or more arguments to construct the tuple from -/// \return Tuple<Args...> -template <typename... Args> -Tuple<Args...> make_tuple(Args... args) { - return Tuple<Args...>(args...); -} - -/// size -/// \brief Provides access to the number of elements in a tuple as a -/// compile-time constant expression. -/// \tparam Args the type of the arguments to construct the tuple from -/// \return size_t -template <typename... Args> -static constexpr size_t size(Tuple<Args...> &) { - return sizeof...(Args); -} - -/// \struct IndexList -/// \brief Creates a list of indices from the elements in the tuple -/// \tparam Is... a list of indices from [0 to sizeof...(tuple elements)) -template <size_t... Is> -struct IndexList {}; - -/// \struct RangeBuilder -/// \brief Collects internal details for generating index ranges [MIN, MAX) -/// Declare primary template for index range builder -/// \tparam MIN is the starting index in the tuple -/// \tparam N represents sizeof...(elements) - sizeof...(Is) -/// \tparam Is... are the list of indices generated so far -template <size_t MIN, size_t N, size_t... Is> -struct RangeBuilder; - -// FIXME Doxygen has problems with recursive inheritance -#ifndef EIGEN_PARSED_BY_DOXYGEN -/// \brief base Step: Specialisation of the \ref RangeBuilder when the -/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements)) -/// \tparam MIN is the starting index of the tuple -/// \tparam Is is [0 to sizeof...(tuple elements)) -template <size_t MIN, size_t... Is> -struct RangeBuilder<MIN, MIN, Is...> { - typedef IndexList<Is...> type; -}; - -/// Induction step: Specialisation of the RangeBuilder class when N!=MIN -/// in this case we are recursively subtracting N by one and adding one -/// index to Is... list until MIN==N -/// \tparam MIN is the starting index in the tuple -/// \tparam N represents sizeof...(elements) - sizeof...(Is) -/// \tparam Is... are the list of indices generated so far -template <size_t MIN, size_t N, size_t... Is> -struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {}; -#endif // EIGEN_PARSED_BY_DOXYGEN - -/// \brief IndexRange that returns a [MIN, MAX) index range -/// \tparam MIN is the starting index in the tuple -/// \tparam MAX is the size of the tuple -template <size_t MIN, size_t MAX> -struct IndexRange: RangeBuilder<MIN, MAX>::type {}; - -/// append_base -/// \brief unpacks the elements of the input tuple t and creates a new tuple -/// by adding element a at the end of it. -///\tparam Args... the type of the elements inside the tuple t -/// \tparam T the type of the new element going to be added at the end of the tuple -/// \tparam I... is the list of indices from [0 to sizeof...(t)) -/// \param t the tuple on which we want to append a. -/// \param a the new element going to be added to the tuple -/// \return Tuple<Args..., T> -template <typename... Args, typename T, size_t... I> -Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) { - return utility::tuple::make_tuple(get<I>(t)..., a); -} - -/// append -/// \brief the deduction function for \ref append_base that automatically -/// generates the \ref IndexRange -///\tparam Args... the type of the elements inside the tuple t -/// \tparam T the type of the new element going to be added at the end of the tuple -/// \param t the tuple on which we want to append a. -/// \param a the new element going to be added to the tuple -/// \return Tuple<Args..., T> -template <typename... Args, typename T> -Tuple<Args..., T> append(Tuple<Args...> t, T a) { - return utility::tuple::append_base(t, a, IndexRange<0, sizeof...(Args)>()); -} - -/// append_base -/// \brief This is a specialisation of \ref append_base when we want to -/// concatenate -/// tuple t2 at the end of the tuple t1. Here we unpack both tuples, generate the -/// IndexRange for each of them and create an output tuple T that contains the -/// elements of both t1 and t2. -///\tparam Args1... the type of the elements inside the tuple t1 -///\tparam Args2... the type of the elements inside the tuple t2 -/// \tparam I1... is the list of indices from [0 to sizeof...(t1)) -/// \tparam I2... is the list of indices from [0 to sizeof...(t2)) -/// \param t1 is the tuple on which we want to append t2. -/// \param t2 is the tuple that is going to be added onto t1. -/// \return Tuple<Args1..., Args2...> -template <typename... Args1, typename... Args2, size_t... I1, size_t... I2> -Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) { - return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...); -} - -/// append -/// \brief deduction function for \ref append_base when we are appending tuple -/// t2 to tuple t1. In this case the \ref IndexRange for both -/// tuples is automatically generated. -///\tparam Args1... the type of the elements inside the tuple t1 -///\tparam Args2... the type of the elements inside the tuple t2 -/// \param t1 is the tuple on which we want to append t2. -/// \param t2 is the tuple that is going to be added onto t1. -/// \return Tuple<Args1..., Args2...> -template <typename... Args1, typename... Args2> -Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) { - return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>()); -} -} // tuple -} // utility - -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP diff --git a/unsupported/doc/Overview.dox b/unsupported/doc/Overview.dox index 45464a545..bae51dcf6 100644 --- a/unsupported/doc/Overview.dox +++ b/unsupported/doc/Overview.dox @@ -11,6 +11,8 @@ Click on the \e Modules tab at the top of this page to get a list of all unsuppo Don't miss the <a href="../index.html">official Eigen documentation</a>. + \subpage SYCL_EIGEN "SYCL backend for Eigen" + */ /* @@ -26,3 +28,4 @@ subject to be included in %Eigen in the future. /// \internal \brief Namespace containing low-level routines from the %Eigen library.
namespace internal {} } + diff --git a/unsupported/doc/SYCL.dox b/unsupported/doc/SYCL.dox new file mode 100644 index 000000000..2295adf21 --- /dev/null +++ b/unsupported/doc/SYCL.dox @@ -0,0 +1,9 @@ +/** \page SYCL_EIGEN Eigen SYCL Backend + +Useful information for Eigen SYCL Backend: + +- <a href="https://developer.codeplay.com/computecppce/latest/getting-started-with-eigen">Getting Started with Eigen</a> + +- <a href="https://developer.codeplay.com/computecppce/latest/options-for-building-eigen-sycl">Options for Building Eigen SYCL</a> + +*/ diff --git a/unsupported/doc/examples/CMakeLists.txt b/unsupported/doc/examples/CMakeLists.txt index bee2b8ad4..7bb67736c 100644 --- a/unsupported/doc/examples/CMakeLists.txt +++ b/unsupported/doc/examples/CMakeLists.txt @@ -18,3 +18,7 @@ foreach(example_src ${examples_SRCS}) ) add_dependencies(unsupported_examples example_${example}) endforeach(example_src) + +if(EIGEN_TEST_SYCL) + add_subdirectory(SYCL) +endif(EIGEN_TEST_SYCL) diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt new file mode 100644 index 000000000..bef4f1925 --- /dev/null +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -0,0 +1,38 @@ +FILE(GLOB examples_SRCS "*.cpp") + +set(EIGEN_SYCL ON) +list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) +if(EIGEN_SYCL_TRISYCL) + set(CMAKE_CXX_STANDARD 14) + set(STD_CXX_FLAG "-std=c++1z") +else(EIGEN_SYCL_TRISYCL) + if(MSVC) + # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 + # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers. + set(CMAKE_CXX_STANDARD 14) + list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) + else() + set(CMAKE_CXX_STANDARD 11) + list(APPEND COMPUTECPP_USER_FLAGS -Wall) + endif() + # The following flags are not supported by Clang and can cause warnings + # if used with -Werror so they are removed here. 
+ if(COMPUTECPP_USE_COMPILER_DRIVER) + set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}) + string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + endif() + list(APPEND COMPUTECPP_USER_FLAGS + -DEIGEN_NO_ASSERTION_CHECKING=1 + -no-serial-memop + -Xclang + -cl-mad-enable) +endif(EIGEN_SYCL_TRISYCL) + +FOREACH(example_src ${examples_SRCS}) + GET_FILENAME_COMPONENT(example ${example_src} NAME_WE) + ei_add_test_internal(${example} example_${example}) + ADD_DEPENDENCIES(unsupported_examples example_${example}) +ENDFOREACH(example_src) +set(EIGEN_SYCL OFF) diff --git a/unsupported/doc/examples/SYCL/CwiseMul.cpp b/unsupported/doc/examples/SYCL/CwiseMul.cpp new file mode 100644 index 000000000..31eb104c6 --- /dev/null +++ b/unsupported/doc/examples/SYCL/CwiseMul.cpp @@ -0,0 +1,63 @@ +#include <iostream> +#define EIGEN_USE_SYCL +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +int main() +{ + using DataType = float; + using IndexType = int64_t; + constexpr auto DataLayout = Eigen::RowMajor; + + auto devices = Eigen::get_sycl_supported_devices(); + const auto device_selector = *devices.begin(); + Eigen::QueueInterface queueInterface(device_selector); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + + // create the tensors to be used in the operation + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 3; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + + // initialize the tensors with the data we want to manipulate + Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); + + // set up some random data in the tensors to be multiplied + in1 = in1.random(); + in2 = in2.random(); + + // allocate device memory for the tensors + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + // create TensorMaps wrapping the device memory + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + + // copy the memory to the device and do the c=a*b calculation + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + + // print out the results + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + std::cout << "device_out" << "(" << i << ", " << j << ", " << k << ") : " << out(i,j,k) + << " vs host_out" << "(" << i << ", " << j << ", " << k << ") : " << in1(i,j,k) * in2(i,j,k) << "\n"; + } + } + } + printf("c=a*b Done\n"); +}
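For contrast, the element-wise product above needs none of the device plumbing when evaluated on the host; a minimal host-only equivalent (a sketch using only the public Tensor API, not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // same shapes as the SYCL example above, evaluated with the default host device
  Eigen::Tensor<float, 3> a(3, 3, 3), b(3, 3, 3);
  a.setRandom();
  b.setRandom();
  Eigen::Tensor<float, 3> c = a * b;  // element-wise product, computed on the CPU
  return c.size() == 27 ? 0 : 1;
}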
\ No newline at end of file diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 3d9ac9263..9db965ad8 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -111,40 +111,113 @@ ei_add_test(special_functions) if(EIGEN_TEST_CXX11) if(EIGEN_TEST_SYCL) + set(EIGEN_SYCL ON) + # Forward CMake options as preprocessor definitions + if(EIGEN_SYCL_USE_DEFAULT_SELECTOR) + add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR}) + endif() + if(EIGEN_SYCL_NO_LOCAL_MEM) + add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM}) + endif() + if(EIGEN_SYCL_LOCAL_MEM) + add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM}) + endif() + if(EIGEN_SYCL_MAX_GLOBAL_RANGE) + add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE}) + endif() + if(EIGEN_SYCL_LOCAL_THREAD_DIM0) + add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0}) + endif() + if(EIGEN_SYCL_LOCAL_THREAD_DIM1) + add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1}) + endif() + if(EIGEN_SYCL_REG_M) + add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M}) + endif() + if(EIGEN_SYCL_REG_N) + add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N}) + endif() + if(EIGEN_SYCL_USE_PROGRAM_CLASS) + add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS}) + endif() + if(EIGEN_SYCL_ASYNC_EXECUTION) + add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION}) + endif() + if(EIGEN_SYCL_DISABLE_SKINNY) + add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY}) + endif() + if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER) + add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER}) + endif() + if(EIGEN_SYCL_DISABLE_RANK1) + add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1}) + endif() + if(EIGEN_SYCL_DISABLE_SCALAR) + add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR}) + endif() + if(EIGEN_SYCL_DISABLE_GEMV) + add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV}) + endif() + if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION) + add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION}) + endif() + if(EIGEN_SYCL_TRISYCL) set(CMAKE_CXX_STANDARD 14) set(STD_CXX_FLAG "-std=c++1z") else() - # It should be safe to always run these tests as there is some fallback code for - # older compiler that don't support cxx11. - # This is already set if EIGEN_TEST_CXX11 is enabled: - # set(CMAKE_CXX_STANDARD 11) - # set(STD_CXX_FLAG "-std=c++11") + if(MSVC) + # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 + # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers. + set(CMAKE_CXX_STANDARD 14) + list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) + else() + set(CMAKE_CXX_STANDARD 11) + list(APPEND COMPUTECPP_USER_FLAGS -Wall) + endif() + # The following flags are not supported by Clang and can cause warnings + # if used with -Werror so they are removed here. 
+ if(COMPUTECPP_USE_COMPILER_DRIVER) + set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}) + string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + endif() + list(APPEND COMPUTECPP_USER_FLAGS + -DEIGEN_NO_ASSERTION_CHECKING=1 + -no-serial-memop + -Xclang + -cl-mad-enable) endif() - ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG}) + set(EIGEN_SYCL OFF) endif() ei_add_test(cxx11_eventcount "-pthread" 
"${CMAKE_THREAD_LIBS_INIT}") diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index 0bbb0f6dc..41ea3cf7b 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -18,6 +18,7 @@ #define EIGEN_USE_SYCL #include "main.h" + #include <unsupported/Eigen/CXX11/Tensor> using Eigen::array; @@ -26,9 +27,8 @@ using Eigen::Tensor; using Eigen::TensorMap; template <typename DataType, int Layout, typename DenseIndex> -static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ - - Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2,2,2}}); +static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) { + Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}}); Tensor<DenseIndex, 0, Layout, DenseIndex> out_max; Tensor<DenseIndex, 0, Layout, DenseIndex> out_min; in.setRandom(); @@ -39,14 +39,15 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ std::size_t in_bytes = in.size() * sizeof(DataType); std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); - DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 3>{{2,2,2}}); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, + Eigen::array<DenseIndex, 3>{{2, 2, 2}}); Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max); Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min); - sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes); gpu_out_max.device(sycl_device) = gpu_in.argmax(); gpu_out_min.device(sycl_device) = gpu_in.argmin(); @@ -54,7 +55,7 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); - VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1); VERIFY_IS_EQUAL(out_min(), 0); sycl_device.deallocate(d_in); @@ -62,22 +63,22 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.deallocate(d_out_min); } - template <typename DataType, int DataLayout, typename DenseIndex> -static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector<DenseIndex> dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { 
- array<DenseIndex, 3> out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); @@ -86,9 +87,13 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) + // = 10.0 + tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0; } } } @@ -97,23 +102,23 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -122,15 +127,18 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
-1.0 : 20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -144,20 +152,21 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) } template <typename DataType, int DataLayout, typename DenseIndex> -static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector<DenseIndex> dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { - array<DenseIndex, 3> out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); @@ -166,9 +175,12 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix) = (ix[dim] != 0) ? 
1.0 : -10.0; } } } @@ -177,23 +189,23 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -202,15 +214,18 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
1.0 : -20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -223,10 +238,8 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) } } - - - -template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_device(const Device_Selector& d){ +template <typename DataType, typename Device_Selector> +void sycl_argmax_test_per_device(const Device_Selector& d) { QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device); @@ -238,8 +251,7 @@ template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_ } EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_argmax_test_per_device<double>(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device<float>(device)); } - } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index db2975783..72cb62fd5 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -25,243 +25,330 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -namespace std { -template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); } +// Functions used to compare the TensorMap implementation on the device with +// the equivalent on the host +namespace cl { +namespace sycl { +template <typename T> T abs(T x) { return cl::sycl::fabs(x); } template <typename T> T square(T x) { return x * x; } template <typename T> T cube(T x) { return x * x * x; } -template <typename T> T inverse(T x) { return 1 / x; } +template <typename T> T inverse(T x) { return T(1) / x; } +template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } +template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } } +} + +struct EqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } +}; + +struct PlusEqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } +}; -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ - { \ - /* out OPERATOR in.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in = in.random() + static_cast<SCALAR>(0.01); \ - out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data = static_cast<SCALAR *>( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for 
(int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(in(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ - } \ - { \ - /* out OPERATOR out.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(reference(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_out); \ +template <typename DataType, int DataLayout, + typename Assignement, typename Operator> +void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + Operator op; + Assignement asgn; + { + /* Assignement(out, Operator(in)) */ + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(in(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); } + { + /* Assignement(out, Operator(out)) */ + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu_out)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(reference(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data_out); + } +} + +#define DECLARE_UNARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template <typename T> \ + auto 
operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ + return cl::sycl::FUNC(x); \ + } \ + template <typename T> \ + auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \ + return x.FUNC(); \ + } \ + }; -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) +DECLARE_UNARY_STRUCT(abs) +DECLARE_UNARY_STRUCT(sqrt) +DECLARE_UNARY_STRUCT(rsqrt) +DECLARE_UNARY_STRUCT(square) +DECLARE_UNARY_STRUCT(cube) +DECLARE_UNARY_STRUCT(inverse) +DECLARE_UNARY_STRUCT(tanh) +DECLARE_UNARY_STRUCT(exp) +DECLARE_UNARY_STRUCT(expm1) +DECLARE_UNARY_STRUCT(log) +DECLARE_UNARY_STRUCT(ceil) +DECLARE_UNARY_STRUCT(floor) +DECLARE_UNARY_STRUCT(round) +DECLARE_UNARY_STRUCT(log1p) +DECLARE_UNARY_STRUCT(sign) +DECLARE_UNARY_STRUCT(isnan) +DECLARE_UNARY_STRUCT(isfinite) +DECLARE_UNARY_STRUCT(isinf) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ - { \ - /* out = in.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ - Tensor<bool, 3, Layout, int64_t> out(tensorRange); \ - in = in.random() + static_cast<SCALAR>(0.01); \ - SCALAR *gpu_data = static_cast<SCALAR *>( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - bool *gpu_data_out = \ - static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(bool)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ +template <typename DataType, int DataLayout, typename Assignement> +void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { +#define RUN_UNARY_TEST(FUNC) \ + test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \ + op_##FUNC>(sycl_device, tensor_range) + RUN_UNARY_TEST(abs); + RUN_UNARY_TEST(sqrt); + RUN_UNARY_TEST(rsqrt); + RUN_UNARY_TEST(square); + RUN_UNARY_TEST(cube); + RUN_UNARY_TEST(inverse); + RUN_UNARY_TEST(tanh); + RUN_UNARY_TEST(exp); + RUN_UNARY_TEST(expm1); + RUN_UNARY_TEST(log); + RUN_UNARY_TEST(ceil); + RUN_UNARY_TEST(floor); + RUN_UNARY_TEST(round); + RUN_UNARY_TEST(log1p); + RUN_UNARY_TEST(sign); 
+} + +template <typename DataType, int DataLayout, typename Operator> +void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<bool, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + bool *gpu_data_out = + static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(bool)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_EQUAL(out(i), op(in(i))); } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) +template <typename DataType, int DataLayout> +void test_unary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_unary_builtins_for_assignement<DataType, DataLayout, + PlusEqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_for_assignement<DataType, DataLayout, + EqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isnan>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isfinite>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isinf>(sycl_device, tensor_range); +} +template <typename DataType> static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float, RowMajor) - TEST_UNARY_BUILTINS(float, ColMajor) + test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range); } -namespace std { -template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); } -template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, in_2) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random() + DataType(0.01); + in_2 = in_2.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_2 = static_cast<DataType *>( + sycl_device.allocate(in_2.size() * 
sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), + (in_2.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, gpu_2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_2); + sycl_device.deallocate(gpu_data_out); } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ - { \ - /* out = in_1.FUNC(in_2) */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver = std::FUNC(in_1(i), in_2(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, 2) */ + Operator op; + const DataType arg2(2); + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random(); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = 
op(gpu_1, arg2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR in_2 */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ - } +#define DECLARE_BINARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template <typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ + return cl::sycl::FUNC(x, y); \ + } \ + template <typename T1, typename T2> \ + auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \ + return x.FUNC(y); \ + } \ + }; -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR 2 */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_out); \ - } +DECLARE_BINARY_STRUCT(cwiseMax) +DECLARE_BINARY_STRUCT(cwiseMin) -#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ - 
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) +#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ + struct op_##NAME { \ + template <typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ + return x OPERATOR y; \ + } \ + }; + +DECLARE_BINARY_STRUCT_OP(plus, +) +DECLARE_BINARY_STRUCT_OP(minus, -) +DECLARE_BINARY_STRUCT_OP(times, *) +DECLARE_BINARY_STRUCT_OP(divide, /) +DECLARE_BINARY_STRUCT_OP(modulo, %) + +template <typename DataType, int DataLayout> +void test_binary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMax>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMin>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_plus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_minus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_times>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_divide>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range); +} -static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { +template <typename DataType> +static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float, RowMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) - TEST_BINARY_BUILTINS(float, ColMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins_fixed_arg2<DataType, RowMajor, + op_modulo>(sycl_device, tensor_range); + test_binary_builtins_fixed_arg2<DataType, ColMajor, + op_modulo>(sycl_device, tensor_range); } EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { QueueInterface queueInterface(device); Eigen::SyclDevice sycl_device(&queueInterface); - CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); - CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); + CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device)); + CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device)); + CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp index a91efe00c..1e7093104 100644 --- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ -419,6 +419,7 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) const size_t 
tensorBuffSize =tensor.size()*sizeof(DataType); const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl; DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize)); @@ -605,14 +606,14 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device); - test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/ test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); + // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index c8e86e69f..fbcc29358 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -17,23 +17,27 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL -#include <iostream> +#include <algorithm> #include <chrono> #include <ctime> +#include <iostream> #include "main.h" + #include <unsupported/Eigen/CXX11/Tensor> using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; -// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU @@ -41,7 +45,6 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); -// Eigen::array<DimPair, 1> dims(DimPair(1, 0)); Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; 
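// Note: DimPair(d1, d2) contracts dimension d1 of the left tensor with
// dimension d2 of the right tensor, so the DimPair(1, 0) above encodes the
// ordinary (m_size x k_size) * (k_size x n_size) matrix product.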
Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; @@ -50,117 +53,217 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, result_dims); - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { continue; } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); + + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); } -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_TF(const Device& sycl_device) -{ - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; - Eigen::array<IndexType, 2> left_dims = {{2, 3}}; - Eigen::array<IndexType, 2> right_dims = {{3, 1}}; - Eigen::array<IndexType, 2> res_dims = {{2, 1}}; - 
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_m(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, + 128); + } +} +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_k(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, + 128); + } +} - Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_n(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, + 128, k); + } +} - t_left.data()[0] = 1.0f; - t_left.data()[1] = 2.0f; - t_left.data()[2] = 3.0f; - t_left.data()[3] = 4.0f; - t_left.data()[4] = 5.0f; - t_left.data()[5] = 6.0f; +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_sizes(const Device &sycl_device) { + IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - t_right.data()[0] = -1.0f; - t_right.data()[1] = 0.5f; - t_right.data()[2] = 2.0f; + IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() * sizeof(DataType); - std::size_t t_result_bytes = t_result.size()*sizeof(DataType); + IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, + 255, 257, 511, 512, 513, 1023, 1024, 1025}; + + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>( + sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; + t_left.setRandom(); + t_right.setRandom(); - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + // Allocate buffers twice as big to check for invalid read and write + auto padded_left_size = 2 * t_left.size(); + auto padded_right_size = 2 * 
t_right.size();
+  auto padded_result_size = 2 * t_result.size();
-  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
-  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
+  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
+  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
+  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
-  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
-  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  // The TensorMaps are still the same size as the Tensors
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  // Write NaNs after the actual buffers so that any invalid read propagates
+  // NaNs everywhere in the result
+  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
+  auto host_left_data = new DataType[padded_left_size];
+  std::copy_n(t_left.data(), t_left.size(), host_left_data);
+  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
+  auto host_right_data = new DataType[padded_right_size];
+  std::copy_n(t_right.data(), t_right.size(), host_right_data);
+  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
+  auto host_result_data = new DataType[padded_result_size];
+  std::fill_n(host_result_data, padded_result_size, nan);
+
+  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
+  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
-  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
+  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
   t_result = t_left.contract(t_right, dims);
   for (IndexType i = 0; i < t_result.size(); i++) {
-    if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - host_result_data[i]))) < error_threshold) {
       continue;
     }
-    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
+    if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
+                                  error_threshold)) {
       continue;
     }
-    std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
-              << " vs " << t_result_gpu(i) << std::endl;
-    assert(false);
+    if (std::isnan(host_result_data[i])) {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", invalid read detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    } else {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType " << i << ": "
+                << t_result(i) << " vs " <<
host_result_data[i] << std::endl; + } + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + // Make sure that the rest of the result is still nans + for (IndexType i = t_result.size(); i < padded_result_size; i++) { + if (std::isnan(host_result_data[i])) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid write detected at IndexType " << i << ": " + << host_result_data[i] << std::endl; + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); - + delete[] host_left_data; + delete[] host_right_data; + delete[] host_result_data; } -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, + IndexType n_size) { + // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << + // ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); Tensor<DataType, 0, DataLayout, IndexType> t_result; @@ -171,32 +274,40 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = sizeof(DataType); + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>> + gpu_t_result(d_t_result); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, 
t_right.data(),t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); - if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold && + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result() - t_result_gpu()))) > error_threshold && !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { - std::cout << "mismatch detected: " << t_result() - << " vs " << t_result_gpu() << std::endl; - assert(false); + std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size + << " : mismatch detected: " << t_result() << " vs " + << t_result_gpu() << std::endl; + VERIFY_IS_APPROX(t_result_gpu(), t_result()); } sycl_device.deallocate(d_t_left); @@ -204,87 +315,712 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, sycl_device.deallocate(d_t_result); } +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_batch(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, IndexType m_batch, + IndexType start, IndexType limit) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + typedef Eigen::array<IndexType, 3> TensorDim; + typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType; + TensorDim left_dims = {{m_batch, k_size, m_size}}; + TensorDim right_dims = {{m_batch, n_size, k_size}}; + TensorDim res_dims = {{m_batch, m_size, n_size}}; + Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}}; -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_m(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128); + TensorType t_left(left_dims); + TensorType t_right(right_dims); + TensorType t_result_gpu(res_dims); + TensorType t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + for (int i = start; i < limit; ++i) { + auto x = gpu_t_left.template chip<0>(i); + auto y = gpu_t_right.template chip<0>(i); + auto z = gpu_t_result.template chip<0>(i); + z.device(sycl_device) = x.contract(y, contract_pairs); + } + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + 
t_result_bytes); + + for (int i = start; i < limit; ++i) { + auto x = t_left.template chip<0>(i); + auto y = t_right.template chip<0>(i); + auto z = t_result.template chip<0>(i); + z = x.contract(y, contract_pairs); + } + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_k(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128); +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType j = 0; j < m_size; j++) { + for (IndexType i = 0; i < n_size; i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType 
m: " << j << " n: " << i + << " CPU : " << t_result(j, i) + << " vs SYCL:" << t_result_gpu(j, i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); + } } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_n(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k); +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_both_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const 
DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}}; -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_sizes(const Device& sycl_device) { - IndexType m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); - IndexType n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; + t_left.setRandom(); + t_right.setRandom(); - IndexType k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - for (IndexType i = 0; i < 15; i++) { - for (IndexType j = 0; j < 15; j++) { - for (IndexType k = 0; k < 17; k++) { - test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); - } + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <typename Dev> +void inline tensorOutofBound(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Test out of bound for Tensor-Tensor + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 1024); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, + 4096); + 
test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024, + 2048); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 1024); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024, + 784); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 10); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096, + 513); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024, + 783); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 784); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024, + 11); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor out of bound tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128, + 128); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128, + 128); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_m(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_n(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; } -template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32); - 
test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
-  test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32);
-  test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
+template <typename Dev>
+void inline tensorTensor_k(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
   std::chrono::time_point<std::chrono::system_clock> start, end;
   start = std::chrono::system_clock::now();
-  test_sycl_contraction<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
-  test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
-  test_scalar<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
-  test_scalar<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
-  test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_m<RowMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device);
-  test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device);
-  test_TF<RowMajor, float, int64_t>(sycl_device);
-  test_TF<ColMajor, float, int64_t>(sycl_device);
+  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_sizes(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+template <typename Dev>
+void inline vectorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // VECTOR-VECTOR
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "contracted tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline vectorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
+                                                       32);
   end = std::chrono::system_clock::now();
-  std::chrono::duration<double> elapsed_seconds = end-start;
+  std::chrono::duration<double> elapsed_seconds = end - start;
   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
   std::cout << "finished computation at " << std::ctime(&end_time)
             << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+// If GEMV is disabled, a single kernel computes the whole contraction, so the
+// accumulation of float values overflows the precision threshold for float
+// and makes the test fail. With GEMV enabled, multiple kernels are created
+// and the accumulation is split among them, which keeps the rounding error of
+// each partial sum below the threshold.
+#ifndef EIGEN_SYCL_DISABLE_GEMV + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032, + 1); +#endif + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorScalar(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // SCALAR Contraction + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_row(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_col(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template <typename Dev> +void inline tensor_contraction_batch_per_device(const Dev 
&sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_lhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4, + 8); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784, + 2048, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 10, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 4096, 1024); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_rhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4, + 16); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10, + 1024, 1024); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 1024, 4096); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 2048); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 1024, 784); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_both_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + 
contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                           32);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
+                                                             16, 64);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
 }
 
 EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
-    CALL_SUBTEST(tensorContractionPerDevice(device));
+  for (const auto &device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
+    CALL_SUBTEST_2(tensorTensor(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
+    CALL_SUBTEST_3(vectorVector(sycl_device));
+    CALL_SUBTEST_4(vectorTensor(sycl_device));
+    CALL_SUBTEST_5(tensorVector(sycl_device));
+    CALL_SUBTEST_6(tensorScalar(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
+    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
+    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
index cc3b02448..d947ead83 100644
--- a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -80,6 +80,8 @@ static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
       VERIFY_IS_EQUAL(out(i, j), 0);
     }
   }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_out_data);
 }
 
 template<typename TensorType>
@@ -147,6 +149,9 @@ static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
       }
     }
   }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
 }
 
 template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 74d38a644..a55a5ad8a 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -36,8 +36,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
   DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
   DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
+  in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
+  in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
   // creating TensorMap from tensor
   Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
@@
-72,5 +72,6 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorForced_evalperDevice<float>(device)); + CALL_SUBTEST(tensorForced_evalperDevice<half>(device)); } } diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp new file mode 100644 index 000000000..db1c0206e --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 245; + IndexType sizeDim2 = 343; + IndexType sizeDim3 = 577; + + array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}}; + + Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range); + Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range); + + + + typedef Eigen::DSizes<IndexType, 3> Index3; + Index3 strides1(1L,1L, 1L); + Index3 indicesStart1(1L, 0L, 0L); + Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3); + + Index3 strides2(1L,1L, 1L); + Index3 indicesStart2(0L, 0L, 0L); + Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3); + Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3); + + tensor1.setRandom(); + tensor2.setRandom(); + + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType)); + gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - 
tensor2.stridedSlice(indicesStart2,indicesStop2,strides2); + + + for (IndexType i = 0; i <slice_range[0] ; ++i) { + for (IndexType j = 0; j < slice_range[1]; ++j) { + for (IndexType k = 0; k < slice_range[2]; ++k) { + VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_computing_test_per_device<double>(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp new file mode 100644 index 000000000..029653e27 --- /dev/null +++ b/unsupported/test/cxx11_tensor_math_sycl.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.tanh(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.tanh(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, 
sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.sigmoid(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.sigmoid(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 93dabe3ec..bf001b40f 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -180,6 +180,82 @@ static void test_simple_slice(const Eigen::SyclDevice &sycl_device) sycl_device.deallocate(gpu_data3); } + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + typedef Eigen::DSizes<IndexType, 5> Index5; + Index5 strides(1L,1L,1L,1L,1L); + Index5 indicesStart(1L,2L,3L,4L,5L); + Index5 indicesStop(2L,3L,4L,5L,6L); + Index5 lengths(1L,1L,1L,1L,1L); + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); + Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); + DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range); + + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, 
tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + + gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides); + sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5)); + + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); + Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range); + + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); + DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); + Index5 strides2(1L,1L,1L,1L,1L); + Index5 indicesStart2(1L,1L,3L,4L,5L); + Index5 indicesStop2(2L,2L,5L,6L,8L); + + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + + gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2); + sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + template<typename DataType, int DataLayout, typename IndexType> static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) { @@ -228,6 +304,65 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_data3); } +template <typename OutIndex, typename DSizes> +Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) { + Eigen::array<OutIndex, DSizes::count> out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType> +int run_eigen(const SyclDevice& sycl_device) { + using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>; + using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>; + using TensorMI64 = TensorMap<TensorI64>; + using TensorMI32 = TensorMap<TensorI32>; + Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}}; + Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}}; + + TensorI64 out_tensor_gpu(tensor_range); + TensorI64 out_tensor_cpu(tensor_range); + out_tensor_cpu.setRandom(); + + TensorI64 sub_tensor(slice_range); + sub_tensor.setRandom(); + + DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType))); + DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType))); + TensorMI64 out_gpu(out_gpu_data, tensor_range); + TensorMI64 
sub_gpu(sub_gpu_data, slice_range); + + sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType)); + sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType)); + + Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}}; + Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}}; + TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions())); + TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions())); + TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions())); + TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions())); + + out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32; + + out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32; + + sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType)); + int has_err = 0; + for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) { + auto exp = out_tensor_cpu(i); + auto val = out_tensor_gpu(i); + if (val != exp) { + std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl; + has_err = 1; + } + } + sycl_device.deallocate(out_gpu_data); + sycl_device.deallocate(sub_gpu_data); + return has_err; +} + template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); @@ -239,6 +374,9 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device); + run_eigen<float, RowMajor, long, int>(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) { diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp new file mode 100644 index 000000000..6c83894a3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random_sycl.cpp @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+ gpu_out.device(sycl_device)=gpu_out.random();
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+ }
+
+ // For now we just check that this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+ out.setZero();
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+ Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+ gpu_out.device(sycl_device)=gpu_out.random(gen);
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+ }
+
+ // For now we just check that this code doesn't crash.
+ // TODO: come up with a valid test of randomness + sycl_device.deallocate(d_out); +} + +template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_random_test_per_device<float>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_random_test_per_device<double>(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index f526299c6..a297716e4 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -16,16 +16,99 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 #include "main.h" + #include <unsupported/Eigen/CXX11/Tensor> +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 753; + const IndexType num_cols = 537; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + array<IndexType, 2> outRange = {{1, 1}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange); + + in.setRandom(); + auto dim = DSizes<IndexType, 2>(1, 1); + full_redux = in.sum().reshape(dim); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate( + sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, + outRange); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); + sycl_device.memcpyDeviceToHost( + full_redux_gpu.data(), gpu_out_data, + (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
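+ // Note: full_redux and full_redux_gpu are 1x1 tensors here because the sum
+ // is reshaped via reshape(dim), hence the (0, 0) indexing below; the other
+ // full reductions in this file write into rank-0 tensors instead.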
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template <typename DataType, int DataLayout, typename IndexType> -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - const IndexType num_rows = 452; - const IndexType num_cols = 765; + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); @@ -34,27 +117,250 @@ static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device in.setRandom(); - full_redux = in.mean(); + full_redux = in.maximum(); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. 
+ in(0) = static_cast<DataType>(2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.mean(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + array<IndexType, 1> argRange = {{num_cols}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper( + gpu_out_arg__gpu_helper_data, argRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + + // CPU 
VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template <typename DataType, int DataLayout, typename IndexType> -static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. + const IndexType n_elems = 8707; + array<IndexType, 1> tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap<scalar_tensor> red_cpu(&full_redux); + TensorMap<scalar_tensor> red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast<DataType>(0.6391); + in = in.constant(const_val); + + Eigen::IndexList<Eigen::type2index<0>> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
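+ // Every element of the input was set to const_val, so the mean must come
+ // back as const_val regardless of how the workgroups decompose the sum.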
+ VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { const IndexType num_rows = 876; const IndexType num_cols = 953; array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; @@ -67,25 +373,73 @@ static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) full_redux = in.minimum(); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.minimum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } - template <typename DataType, int DataLayout, typename IndexType> -static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. 
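+ // The sentinel below is written to element 0, which lies before the offset
+ // view starting at in.data() + 64, so the reduction never sees it.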
+ in(0) = static_cast<DataType>(-2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { IndexType dim_x = 145; IndexType dim_y = 1; IndexType dim_z = 67; @@ -101,33 +455,293 @@ static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_dev in.setRandom(); - redux= in.maximum(red_axis); + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; + + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. 
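+ // These sentinels fill the first 64 elements of the buffer; the offset view
+ // starts at in.data() + 64, so none of them participate in the reduction.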
+ for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } + + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> full_reduced_range = {{num_rows}}; + array<IndexType, 1> reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + redux.setZero(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. 
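+ // red_offset maps redux.data() + 1, so redux(0) is never written by this
+ // reduction and must keep the value assigned by redux.setZero().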
+ VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } - TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) - for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), red_offset(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array<IndexType, 2> tensorRange = {{dim_x, dim_y}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 1> reduced_tensorRange = {{dim_y}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } template <typename DataType, int DataLayout, typename IndexType> -static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 2; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - IndexType dim_x = 567; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; IndexType dim_y = 1; - IndexType dim_z = 47; + IndexType dim_z = 32; array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; Eigen::array<IndexType, 1> red_axis; @@ -140,42 +754,261 @@ static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_devi in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) - for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
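+ // Unlike the dynamic tests above, which pass the axis in an Eigen::array at
+ // runtime, this variant fixes both the shape and the reduction axis at
+ // compile time, e.g.
+ //   Eigen::IndexList<Eigen::type2index<1>> red_axis;  // static axis 1
+ //   redux_fix = in_fix.sum(red_axis);                 // Sizes<64> result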
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); } -template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. 
 
-template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::device& d){
-  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
-  QueueInterface queueInterface(d);
-  auto sycl_device = Eigen::SyclDevice(&queueInterface);
-  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+  redux_fix = in_fix.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  sycl_device.synchronize();
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data. This is an example of how to check
+// whether a Tensor contains nan and/or inf in one reduction.
+template <typename InT, typename OutT>
+struct CustomReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  static constexpr OutT InfBit = 1;
+  static constexpr OutT NanBit = 2;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+                                                    OutT* accum) const {
+    if (Eigen::numext::isinf(x))
+      *accum |= InfBit;
+    else if (Eigen::numext::isnan(x))
+      *accum |= NanBit;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+                                                    OutT* accum) const {
+    *accum |= x;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+    return OutT(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+    return accum;
+  }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+          typename IndexType>
+static void test_full_reductions_custom_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  constexpr IndexType InSize = 64;
+  auto tensorRange = Sizes<InSize>(InSize);
+  Eigen::IndexList<Eigen::type2index<0>> dims;
+  auto reduced_tensorRange = Sizes<>();
+  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+  CustomReducer<DataType, AccumType> reducer;
+
+  in_fix.setRandom();
+
+  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+  AccumType* gpu_out_data =
+      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+                                 sizeof(AccumType));
+  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
   test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  129, 8);
   test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
   test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
-  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  sycl_device.synchronize();
 }
+
 EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
-    CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
   }
 }
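The custom-reducer test is the most instructive hunk in this file: the accumulator type (an int bitmask) differs from the scalar type being reduced, and the reducer must fold both input scalars and partial accumulators through its initialize/reduce/finalize interface. Assuming the CustomReducer definition above is in scope, a host-only sketch exercising that fold contract by hand (the injected values and bit checks are mine, not part of the test; the SYCL kernel performs the same fold in parallel):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>
#include <limits>

int main() {
  float data[4] = {1.0f, std::numeric_limits<float>::infinity(), 2.0f,
                   std::numeric_limits<float>::quiet_NaN()};
  CustomReducer<float, int> reducer;
  int accum = reducer.initialize();            // starts at 0
  for (float x : data) reducer.reduce(x, &accum);
  int flags = reducer.finalize(accum);
  std::cout << "inf seen: " << ((flags & 1) != 0)   // InfBit
            << ", nan seen: " << ((flags & 2) != 0) // NanBit
            << std::endl;
  return 0;
}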
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index 77c2235d1..dd30c235d 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -20,10 +20,8 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
-
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
-
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   IndexType dim1 = 2;
   IndexType dim2 = 3;
   IndexType dim3 = 5;
@@ -40,21 +38,30 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[2] = true;
   dim_rev[3] = false;
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
 
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+                                                                 tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(i, 2 - j, 4 - k, l));
         }
       }
     }
@@ -65,13 +72,15 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[3] = false;
 
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
         }
       }
     }
@@ -82,13 +91,16 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[2] = false;
   dim_rev[3] = true;
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(1 - i, j, k, 6 - l));
         }
       }
     }
@@ -98,11 +110,9 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   sycl_device.deallocate(gpu_out_data);
 }
 
-
-
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
-{
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+                              bool LValue) {
   IndexType dim1 = 2;
   IndexType dim2 = 3;
   IndexType dim3 = 5;
@@ -120,24 +130,32 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
   dim_rev[2] = false;
   dim_rev[3] = true;
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
-
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+      expected.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_result = static_cast<DataType*>(
+      sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
 
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+      gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+      gpu_out_data_result, tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
 
   if (LValue) {
     out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
   } else {
     out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
   }
-  sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
-
+  sycl_device.memcpyDeviceToHost(
+      expected.data(), gpu_out_data_expected,
+      expected.dimensions().TotalSize() * sizeof(DataType));
 
   array<IndexType, 4> src_slice_dim;
   src_slice_dim[0] = 2;
@@ -154,8 +172,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
 
   for (IndexType i = 0; i < 5; ++i) {
     if (LValue) {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
-          in_gpu.slice(src_slice_start, src_slice_dim);
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
     } else {
       out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
           in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
@@ -163,13 +182,15 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
     src_slice_start[2] += 1;
     dst_slice_start[2] += 1;
   }
-  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < expected.dimension(0); ++i) {
     for (IndexType j = 0; j < expected.dimension(1); ++j) {
       for (IndexType k = 0; k < expected.dimension(2); ++k) {
         for (IndexType l = 0; l < expected.dimension(3); ++l) {
-          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
         }
       }
     }
@@ -177,34 +198,37 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
 
   dst_slice_start[2] = 0;
   result.setRandom();
-  sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_out_data_result, result.data(),
+      (result.dimensions().TotalSize()) * sizeof(DataType));
   for (IndexType i = 0; i < 5; ++i) {
-    if (LValue) {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
-          in_gpu.slice(dst_slice_start, dst_slice_dim);
-    } else {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
-          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
-    }
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
     dst_slice_start[2] += 1;
   }
-  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < expected.dimension(0); ++i) {
     for (IndexType j = 0; j < expected.dimension(1); ++j) {
       for (IndexType k = 0; k < expected.dimension(2); ++k) {
         for (IndexType l = 0; l < expected.dimension(3); ++l) {
-          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
         }
       }
     }
   }
 }
 
-
-
-template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
-  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
   QueueInterface queueInterface(d);
   auto sycl_device = Eigen::SyclDevice(&queueInterface);
   test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
@@ -215,7 +239,15 @@ template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de
   test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
-    CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.get_info<cl::sycl::info::device::name>() << std::endl;
+    CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+    CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+    CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
   }
 }
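The reverse tests above reduce to one indexing identity: reversing dimension d of extent n_d maps index i to n_d - 1 - i, which is why, for the 2x3x5x7 input with dims 1 and 2 reversed, tensor(i,j,k,l) must equal reversed_tensor(i, 2-j, 4-k, l). A host-only sketch of the same identity in two dimensions (sizes are illustrative, no SYCL device involved):

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 2> t(3, 5);
  t.setRandom();
  Eigen::array<bool, 2> rev{{true, false}};  // reverse dimension 0 only
  Eigen::Tensor<float, 2> r = t.reverse(rev);
  assert(r(0, 1) == t(2, 1));  // index i maps to 3 - 1 - i in dim 0
  return 0;
}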
diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp
new file mode 100644
index 000000000..09c45fce5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
+                      IndexType k_size, IndexType n_size, int consume_dim,
+                      bool exclusive) {
+  static const DataType error_threshold = 1e-4f;
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
+            << "), consume_dim : " << consume_dim << std::endl;
+  Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
+                                                          n_size);
+
+  t_input.setRandom();
+  std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType* gpu_data_in =
+      static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
+  DataType* gpu_data_out =
+      static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+  array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
+      gpu_data_in, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
+      gpu_data_out, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
+  sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
+
+  t_result = t_input.cumsum(consume_dim, exclusive);
+
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
+                                 t_result_bytes);
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
+              << " vs SYCL : " << t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+  sycl_device.deallocate(gpu_data_in);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(
+        sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(
+        sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(
+        sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_5(
+        sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_6(
+        sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
+  }
+}
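test_sycl_cumsum drives both flavours of the new scan kernel through the same cumsum entry point: the second argument selects an exclusive scan, which shifts the running sums by one position and starts from the identity. A host-only illustration of the two semantics (values are mine, not from the test):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<int, 1> v(4);
  v.setValues({1, 2, 3, 4});
  Eigen::Tensor<int, 1> inclusive = v.cumsum(0);        // 1 3 6 10
  Eigen::Tensor<int, 1> exclusive = v.cumsum(0, true);  // 0 1 3 6
  for (int i = 0; i < 4; ++i)
    std::cout << inclusive(i) << " " << exclusive(i) << std::endl;
  return 0;
}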
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index 0e8cc3bd2..ca4e8b5ef 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -12,14 +12,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
-
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -29,33 +27,33 @@ using Eigen::Tensor;
 using Eigen::TensorMap;
 
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
-{
+static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
   IndexType sizeDim1 = 2;
   IndexType sizeDim2 = 3;
   IndexType sizeDim3 = 5;
   IndexType sizeDim4 = 7;
   array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
-  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
-  Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
   tensor.setRandom();
 
-  const size_t buffSize =tensor.size()*sizeof(DataType);
+  const size_t buffSize = tensor.size() * sizeof(DataType);
   array<IndexType, 4> shuffles;
   shuffles[0] = 0;
   shuffles[1] = 1;
   shuffles[2] = 2;
   shuffles[3] = 3;
-  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
-  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
-
+  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
 
-  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
+                                                             tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
+                                                             tensorRange);
 
   sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
 
-  gpu2.device(sycl_device)=gpu1.shuffle(shuffles);
+  gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
   sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
   sycl_device.synchronize();
 
@@ -68,7 +66,7 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
   for (IndexType j = 0; j < sizeDim2; ++j) {
     for (IndexType k = 0; k < sizeDim3; ++k) {
       for (IndexType l = 0; l < sizeDim4; ++l) {
-        VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
       }
     }
   }
@@ -78,12 +76,14 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
   shuffles[1] = 3;
   shuffles[2] = 1;
   shuffles[3] = 0;
-  array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
-  Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle);
-  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
-  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle);
-
-  gpu3.device(sycl_device)=gpu1.shuffle(shuffles);
+  array<IndexType, 4> tensorrangeShuffle = {
+      {sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
+  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
+      gpu_data3, tensorrangeShuffle);
+
+  gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
   sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
   sycl_device.synchronize();
 
@@ -96,24 +96,22 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
   for (IndexType j = 0; j < sizeDim2; ++j) {
     for (IndexType k = 0; k < sizeDim3; ++k) {
       for (IndexType l = 0; l < sizeDim4; ++l) {
-        VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
       }
     }
   }
 }
 }
 
-
-template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){
+template <typename DataType, typename dev_Selector>
+void sycl_shuffling_test_per_device(dev_Selector s) {
   QueueInterface queueInterface(s);
   auto sycl_device = Eigen::SyclDevice(&queueInterface);
   test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
   test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
-
 }
-EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl)
-{
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }
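The second check in the shuffling test encodes how shuffle() interprets its permutation: dimension d of the output is dimension shuffles[d] of the input, so with shuffles = {2, 3, 1, 0} the 2x3x5x7 input becomes a 5x7x3x2 output and tensor(i,j,k,l) == shuffle(k,l,j,i). A host-only sketch of that mapping (sizes as in the test, but no SYCL device):

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 4> in(2, 3, 5, 7);
  in.setRandom();
  Eigen::array<int, 4> shuffles{{2, 3, 1, 0}};
  Eigen::Tensor<float, 4> out = in.shuffle(shuffles);  // shape (5, 7, 3, 2)
  assert(out.dimension(0) == 5 && out.dimension(3) == 2);
  assert(out(4, 6, 2, 1) == in(1, 2, 4, 6));
  return 0;
}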
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 9357bed02..e6c5e2378 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -29,9 +29,9 @@ using Eigen::TensorMap;
 
 template <typename DataType, int DataLayout, typename IndexType>
 void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
-  IndexType sizeDim1 = 100;
-  IndexType sizeDim2 = 10;
-  IndexType sizeDim3 = 20;
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 5;
+  IndexType sizeDim3 = 1;
   array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
   Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
   Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
@@ -56,6 +56,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
   sycl_device.synchronize();
 
   for (IndexType i = 0; i < in1.size(); ++i) {
+    // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n";
     VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
     VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
     VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
@@ -94,6 +95,88 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
 }
 
 template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type in1(tensorRange);
+  tensor_type out(tensorRange);
+
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  in1 = in1.random();
+  // Copy all data to device, then permute on copy back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  // Permute copies to device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
+  // Copy all to device, permute copies on device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  sycl_device.deallocate(gpu_data_out);
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type cpu_out(tensorRange);
+  tensor_type out(tensorRange);
+
+  cpu_out.setZero();
+
+  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
+  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
+
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
+  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < full_size; ++i) {
+    VERIFY_IS_APPROX(out(i), cpu_out(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
 void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
 
   IndexType sizeDim1 = 100;
@@ -262,6 +345,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
   test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
   test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
   test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
   test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
   test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
   test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
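A subtlety worth noting in test_sycl_memset_offsets: memset has byte semantics, so filling float storage with the value 1 produces the bit pattern 0x01010101 (roughly 2.37e-38), not 1.0f. The test stays correct because it compares the device memset against std::memset on the host rather than against a literal. A host-only check of that assumption:

#include <cstdio>
#include <cstring>

int main() {
  float f;
  std::memset(&f, 1, sizeof(f));  // fills the four bytes with 0x01
  std::printf("%g\n", f);         // prints ~2.36943e-38, not 1
  return 0;
}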