Diffstat (limited to 'unsupported')
94 files changed, 4641 insertions, 612 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 39916092b..d243fe035 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -19,7 +19,7 @@
 #undef isnan
 #undef isinf
 #undef isfinite
-#include <SYCL/sycl.hpp>
+#include <CL/sycl.hpp>
 #include <iostream>
 #include <map>
 #include <memory>
@@ -141,6 +141,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGenerator.h"
 #include "src/Tensor/TensorAssign.h"
 #include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
 #include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorExecutor.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 38cdb9c69..30d553af7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -83,8 +83,8 @@ large enough to hold all the data.
    // You can also map fixed-size tensors. Here we get a 1d view of
    // the 2d fixed-size tensor.
-    Tensor<float, Sizes<4, 5>> t_4x3;
-    TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
+    TensorFixedSize<float, Sizes<4, 5>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);

#### Class TensorRef
@@ -272,7 +272,7 @@ Operation to a TensorFixedSize instead of a Tensor, which is a bit more
efficient.

    // We know that the result is a 4x4x2 tensor!
-    TensorFixedSize<float, 4, 4, 2> result = t5;
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;

Similarly, assigning an expression to a TensorMap causes its evaluation. Like
tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
@@ -296,7 +296,7 @@ the expression in a temporary Tensor of the right size. The code above in
effect does:

    // .eval() knows the size!
-    TensorFixedSize<float, 4, 4, 2> tmp = t1 + t2;
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
    Tensor<float, 3> result = (tmp * 0.2f).exp();

Note that the return value of ```eval()``` is itself an Operation, so the
@@ -567,11 +567,11 @@ to the rank of the tensor. The content of the tensor is not initialized.

### TensorFixedSize

-Creates a tensor of the specified size. The number of arguments in the Size<>
+Creates a tensor of the specified size. The number of arguments in the Sizes<>
template parameter determines the rank of the tensor. The content of the tensor
is not initialized.

-    Eigen::TensorFixedSize<float, Size<3, 4>> a;
+    Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
    cout << "Rank: " << a.rank() << endl;
    => Rank: 2
    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
@@ -584,11 +584,11 @@ until the TensorMap is discarded, and the size of the data must be large enough
to accommodate all of the coefficients of the tensor.

    float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-    Eigen::TensorMap<float, 2> a(data, 3, 4);
+    Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
    cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
    => NumRows: 3 NumCols: 4
    cout << "a(1, 2): " << a(1, 2) << endl;
-    => a(1, 2): 9
+    => a(1, 2): 7

## Contents Initialization
@@ -1016,13 +1016,20 @@ multidimensional case.
    a.setValues({{1, 2}, {4, 5}, {5, 6}});

    // Compute the traditional matrix product
-    array<IndexPair<int>, 1> product_dims = { IndexPair(1, 0) };
+    Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair(1, 0) };
    Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);

    // Compute the product of the transpose of the matrices
-    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair(0, 1) };
+    Eigen::array<Eigen::IndexPair<int>, 1> transpose_product_dims = { Eigen::IndexPair(0, 1) };
    Eigen::Tensor<int, 2> AtBt = a.contract(b, transpose_product_dims);
-
+
+    // Contraction to a scalar value using a double contraction.
+    // The first coordinates of both tensors are contracted, as well as both second coordinates.
+    Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+    Eigen::Tensor<int, 0> AdoublecontractedA = a.contract(a, double_contraction_product_dims);
+
+    // Extracting the scalar value of the tensor contraction for further usage
+    int value = AdoublecontractedA(0);

## Reduction Operations
@@ -1168,6 +1175,58 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
in TensorFunctors.h for information on how to implement a reduction operator.

+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the ```Reduction Dimensions```, the trace dimensions
+are passed as an input parameter to the operation, are of type ```<TensorType>::Dimensions```,
+and have the same requirements as the reduction dimensions. In addition, the
+trace dimensions must all have the same size.
+
+Example: Trace along 2 dimensions.
+
+    // Create a tensor of 3 dimensions
+    Eigen::Tensor<int, 3> a(2, 2, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+    // Specify the dimensions along which the trace will be computed.
+    // In this example, the trace can only be computed along the dimensions
+    // with indices 0 and 1
+    Eigen::array<int, 2> dims({0, 1});
+    // The output tensor contains all but the trace dimensions.
+    Tensor<int, 1> a_trace = a.trace(dims);
+    cout << "a_trace:" << endl;
+    cout << a_trace << endl;
+    =>
+    a_trace:
+    11
+    13
+    15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+    // Create a tensor of 3 dimensions, with all dimensions having the same size.
+    Eigen::Tensor<int, 3> a(3, 3, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+                {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+                {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+    // Result is a zero dimension tensor
+    Tensor<int, 0> a_trace = a.trace();
+    cout<<"a_trace:"<<endl;
+    cout<<a_trace<<endl;
+    =>
+    a_trace:
+    42
+
+
## Scan Operations

A *Scan* operation returns a tensor with the same dimensions as the original
@@ -1314,7 +1373,7 @@ The previous example can be rewritten as follow:
    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
-    Eigen::Tensor<float, 1, Eigen::ColMajor> b;
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
    b.reshape(two_dim) = a;
    cout << "b" << endl << b << endl;
    =>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40cd8..c0f33ba2d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -119,6 +119,12 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+    return m_impl;
+  }
+#endif
+
 protected:
  TensorEvaluator<ArgType, Device> m_impl;
};
@@ -172,7 +178,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
                                                             const ReduceOp& reduce_op,
-                                                            const int return_dim,
+                                                            const Index return_dim,
                                                             const Dims& reduce_dims)
      : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}

@@ -187,12 +193,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
    const Dims& reduce_dims() const { return m_reduce_dims; }

    EIGEN_DEVICE_FUNC
-    int return_dim() const { return m_return_dim; }
+    Index return_dim() const { return m_return_dim; }

  protected:
    typename XprType::Nested m_xpr;
    const ReduceOp m_reduce_op;
-    const int m_return_dim;
+    const Index m_return_dim;
    const Dims m_reduce_dims;
};
@@ -222,7 +228,11 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_orig_impl(op.expression(), device),
        m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()) {
+        m_return_dim(op.return_dim())
+#ifdef EIGEN_USE_SYCL
+        ,m_device(device)
+#endif
+    {

    gen_strides(m_orig_impl.dimensions(), m_strides);
    if (Layout == static_cast<int>(ColMajor)) {
@@ -252,7 +262,16 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
    return (m_return_dim < 0) ?
v.first : (v.first % m_stride_mod) / m_stride_div; } + #ifndef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + #else // following functions are required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} + const Device& device() const{return m_device;} + #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -288,10 +307,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi protected: TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl; TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; +#ifdef EIGEN_USE_SYCL + const Device& m_device; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h new file mode 100644 index 000000000..442639868 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * TensorArgMaxSycl.h + * \brief: + * TensorArgMaxSycl + * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +namespace Eigen { +namespace internal { + template<typename Dims, typename XprType> + struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense> + { + typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type; + }; + + template<typename Dims, typename XprType> + struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1, + typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type> + { + typedef TensorTupleReducerDeviceOp<Dims, XprType> type; + }; + +template<typename StrideDims, typename XprType> +struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType> +{ + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + + +}// end namespace internal +template<typename StrideDims, typename XprType> +class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index; + typedef typename XprType::CoeffReturnType TupleType; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, + const Index return_dim, + const StrideDims strides, + const Index stride_mod, const Index stride_div) + :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + Index return_dim() const { return m_return_dim; } + + EIGEN_DEVICE_FUNC + const StrideDims& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const Index& stride_mod() const { return m_stride_mod; } + + EIGEN_DEVICE_FUNC + const Index& stride_div() const { return m_stride_div; } + + protected: + typename Eigen::internal::remove_all<typename + XprType::Nested + >::type m_xpr; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + + +// Eval as rvalue +template<typename StrideDims, typename ArgType> +struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice> +{ + typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::TupleType TupleType; + typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions; + + enum { + IsAligned = 
false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) + : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), + m_stride_div(op.stride_div()){} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } +typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); } + +protected: + TensorEvaluator<ArgType , SyclKernelDevice> m_impl; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; +} // end namespace Eigen +#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 166be200c..027305586 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -34,6 +34,7 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; static const int Layout = internal::traits<LhsXprType>::Layout; + typedef typename traits<LhsXprType>::PointerType PointerType; enum { Flags = 0 @@ -168,7 +169,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> /// required by sycl in order to extract the accessor const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_leftImpl.data(); } private: TensorEvaluator<LeftArgType, Device> m_leftImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe340820..0d6331e9c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -619,7 +619,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const array<Index, NumDimensions>, const Derived> argmax() const { array<Index, NumDimensions> in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, NumDimensions>, @@ -632,7 +632,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const array<Index, NumDimensions>, const Derived> argmin() const { array<Index, NumDimensions> in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < 
NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, NumDimensions>, @@ -643,7 +643,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const TensorTupleReducerOp< internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, 1>, const Derived> - argmax(const int return_dim) const { + argmax(const Index return_dim) const { array<Index, 1> in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -656,7 +656,7 @@ class TensorBase<Derived, ReadOnlyAccessors> const TensorTupleReducerOp< internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >, const array<Index, 1>, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array<Index, 1> in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -671,6 +671,18 @@ class TensorBase<Derived, ReadOnlyAccessors> return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); } + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp<const Dims, const Derived> + trace(const Dims& dims) const { + return TensorTraceOp<const Dims, const Derived>(derived(), dims); + } + + const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived> + trace() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims); + } + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp<const Broadcast, const Derived> broadcast(const Broadcast& broadcast) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 23a74460e..b6c93aff9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -31,6 +31,7 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Broadcast, typename XprType> @@ -372,7 +373,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c46a778b5..21ffa2872 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -32,6 +32,7 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions - 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<DenseIndex DimId, typename XprType> @@ -264,7 +265,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() 
const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) || (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) && diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 2c7ba961c..a7c1380b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,6 +37,8 @@ struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > static const int NumDimensions = traits<LhsXprType>::NumDimensions; static const int Layout = traits<LhsXprType>::Layout; enum { Flags = 0 }; + typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, + typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; }; template<typename Axis, typename LhsXprType, typename RhsXprType> @@ -275,7 +277,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy TensorOpCost(0, 0, compute_cost); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index bf4a476d9..e72ddb4a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -104,6 +104,8 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > // From NumDims below. 
static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; static const int Layout = traits<LhsXprType>::Layout; + typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, + typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; enum { Flags = 0 @@ -609,7 +611,7 @@ struct TensorContractionEvaluatorBase return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index c04b784a4..903bc51cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -12,7 +12,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) namespace Eigen { @@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif #define reduceRow(i, mask) \ shuffleInc(i, 0, mask); \ @@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif if((threadIdx.x%8) < 4) { rhs_pf0.y = x1; rhs_pf0.w = x2; @@ -1382,5 +1391,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } // end namespace Eigen -#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_USE_GPU and EIGEN_CUDACC #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index e87de0c57..e6840bc87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -11,7 +11,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/***************************************************************** - * TensorSyclConvertToDeviceExpression.h + * TensorTensorContractionsycl.h * * \brief: * TensorContractionsycl @@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { + if (data) { evalTo(data); return false; } else { @@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; + size_t out_offset; Index roundUpK, M, N, K; ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; @@ -182,18 +183,19 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS Device dev; - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), + out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<2> itemID) { typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr; typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr; @@ -230,13 +232,13 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS const Index nGroupId = itemID.get_group(1); // Work-group ID localCol const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + LhsScalar privateLhs; + RhsScalar privateRhs[WorkLoadPerThreadN]; + OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { for (Index wLPTN=0; 
wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] = 0.0f; + privateRes[wLPTM][wLPTN] = static_cast<OutScalar>(0); } } @@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; if(globalCol<N) - out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; + out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN]; } } } @@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo // extract lhs functor list LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); - + ptrdiff_t out_offset = self.device().get_offset(buffer); self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { /// work-around for gcc bug typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; @@ -379,18 +381,17 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor; //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - + OutAccessor out_res= self.device(). 
template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer); // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, - WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors, + localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index b29968b63..182bef918 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,6 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> > static const int NumDimensions = traits<XprType>::NumDimensions; static const int Layout = traits<XprType>::Layout; enum { Flags = 0 }; + typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType; }; template<typename TargetType, typename XprType> @@ -244,7 +245,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> } } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 378f5cccb..84d5be173 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -231,6 +231,8 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; static const int NumDimensions = traits<InputXprType>::NumDimensions; static const int Layout = traits<InputXprType>::Layout; + typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val, + typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType; enum { Flags = 0 @@ -465,7 +467,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr PacketSize)); } - EIGEN_DEVICE_FUNC Scalar* data() 
const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -551,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr // Use an optimized implementation of the evaluation code for GPUs whenever possible. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template <int StaticKernelSize> struct GetKernelSize { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 4247c1c4a..da88bcb3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -32,19 +32,20 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize, range_x, range_y; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<2> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -89,19 +90,20 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, 
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } } }; @@ -156,21 +158,22 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; auto device_expr 
=TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter } const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - buffer_ptr[tensor_index] = result; + buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; } itemID.barrier(cl::sycl::access::fence_space::local_space); @@ -297,7 +300,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr /// used by sycl in order to build the sycl buffer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buf; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. it's an @@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr m_kernel = in_place; m_local_kernel = false; } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); Scalar* local = (Scalar*)m_device.allocate(kernel_sz); typedef TensorEvalToOp<const KernelArgType> EvalTo; EvalTo evalToTmp(local, m_kernelArg); @@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; // extract input functor list InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + ptrdiff_t out_offset = m_device.get_offset(data); m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { @@ -335,8 +339,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr // create input tuple of accessors InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl); - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data); + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data); typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType; KernelAccessorType kernel_acc= m_device. 
template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel); @@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -421,6 +425,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } }); + m_device.asynchronousExec(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf1..b148dae39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,10 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini<double>(threads, GenericNumTraits<int>::highest()); + return numext::mini(max_threads, numext::maxi<int>(1, threads)); } // taskSize assesses parallel task size. 
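The TensorCostModel hunk just above keeps the thread-count estimate in a double and clamps it before converting, because casting a floating-point value that does not fit into int is undefined behaviour in C++. A minimal standalone sketch of the same clamp-then-convert pattern (the function and parameter names here are illustrative, not Eigen's API):

    #include <algorithm>
    #include <limits>

    // Estimate a thread count from a cost in cycles, clamping while still in
    // the double domain so the final conversion to int can never overflow.
    inline int clamped_num_threads(double total_cost_cycles,
                                   double startup_cycles,
                                   double per_thread_cycles,
                                   int max_threads) {
      double threads = (total_cost_cycles - startup_cycles) / per_thread_cycles + 0.9;
      // Clamp before converting: a double larger than INT_MAX cast to int is UB.
      threads = std::min(threads, static_cast<double>(std::numeric_limits<int>::max()));
      threads = std::max(threads, 1.0);
      return std::min(max_threads, static_cast<int>(threads));
    }

Doing the clamp on the double, as the patch does with numext::mini<double>, guarantees the subsequent int conversion is always well defined.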
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..0e4db46de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -30,6 +30,7 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = traits<XprType>::NumDimensions; static const int Layout = traits<XprType>::Layout; + typedef typename traits<XprType>::PointerType PointerType; }; template<typename CustomUnaryFunc, typename XprType> @@ -138,7 +139,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { @@ -180,6 +185,8 @@ struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; static const int NumDimensions = traits<LhsXprType>::NumDimensions; static const int Layout = traits<LhsXprType>::Layout; + typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, + typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType; }; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> @@ -293,7 +300,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + EIGEN_DEVICE_FUNC typename internal::traits<XprType>::PointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index be8d69386..ded7129da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -211,7 +211,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) @@ -239,7 +239,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); @@ -265,7 +265,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) cudaError_t err = cudaStreamSynchronize(stream_->stream()); if (err != cudaSuccess) { std::cerr << "Error detected in CUDA stream: " @@ -304,7 +304,7 @@ struct GpuDevice { // This 
function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC cudaError_t error = cudaStreamQuery(stream_->stream()); return (error == cudaSuccess) || (error == cudaErrorNotReady); #else @@ -323,9 +323,9 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6cb2..341889e88 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,7 +35,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running on the host CPU return 1; #else @@ -45,7 +45,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +55,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,13 +65,13 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; #else // Running on a CUDA device - return __CUDA_ARCH__ / 100; + return EIGEN_CUDA_ARCH / 100; #endif } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799bb..6158acbd9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,10 +14,41 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +template<size_t Align> struct CheckAlignStatically { + static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); +}; +template <bool IsAligned, size_t Align> +struct Conditional_Allocate { + + EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { + return aligned_alloc(Align, elements); + } +}; +template <size_t Align> +struct Conditional_Allocate<false, Align> { + + EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ + return malloc(elements); + } +}; +template <typename Scalar, size_t Align = EIGEN_MAX_ALIGN_BYTES, class Allocator = std::allocator<Scalar>> +struct SyclAllocator { + typedef Scalar value_type; + typedef typename std::allocator_traits<Allocator>::pointer pointer; + typedef typename std::allocator_traits<Allocator>::size_type size_type; + + SyclAllocator( ){}; + Scalar* allocate(std::size_t elements) { + return 
static_cast<Scalar*>(Conditional_Allocate<CheckAlignStatically<Align>::Val, Align>::conditional_allocate(elements)); + } + void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } +}; namespace Eigen { - #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) +#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast<Scalar*>(static_cast<void*>(((buf_acc.get_pointer().get())))) +#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) + template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { public: @@ -40,47 +71,58 @@ namespace Eigen { size_t m_offset; }; +template<typename AccType> struct memsetkernelFunctor{ - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType; AccType m_acc; + const ptrdiff_t buff_offset; const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} void operator()(cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; + if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; } }; +struct memsetCghFunctor{ + cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& m_buf; + const ptrdiff_t& buff_offset; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c)); + } +}; + +//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ - auto devices = cl::sycl::device::get_devices(); - std::vector<cl::sycl::device>::iterator it =devices.begin(); - while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. 
Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) - auto s= (*it).template get_info<cl::sycl::info::device::vendor>(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs - it=devices.erase(it); - } - else{ - ++it; +std::vector<cl::sycl::device> supported_devices; +auto plafrom_list =cl::sycl::platform::get_platforms(); +for(const auto& platform : plafrom_list){ + auto device_list = platform.get_devices(); + auto platform_name =platform.template get_info<cl::sycl::info::platform::name>(); + std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); + for(const auto& device : device_list){ + auto vendor = device.template get_info<cl::sycl::info::device::vendor>(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsuported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) || + (device.is_gpu() && platform_name.find("intel")!=std::string::npos); + if(!unsuported_condition){ + std::cout << "Platform name "<< platform_name << std::endl; + supported_devices.push_back(device); } } - return devices; +} +return supported_devices; } -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; +class QueueInterface { +public: /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s): @@ -116,11 +158,11 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. 
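The SyclAllocator introduced above chooses between aligned and plain allocation at compile time: CheckAlignStatically verifies that the requested alignment is a power of two no smaller than sizeof(void*), and Conditional_Allocate dispatches on that result. The following is a minimal standalone sketch of the same dispatch, with shortened names, assuming a C11-style aligned_alloc just as the patch does:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <stdlib.h>

    // Alignment is only usable if it is a power of two and at least sizeof(void*).
    template <std::size_t Align>
    struct CheckAlign {
      static const bool value = ((Align & (Align - 1)) == 0) && (Align >= sizeof(void*));
    };

    // Valid alignment: use aligned_alloc (the size passed must be a multiple of Align).
    template <bool IsAligned, std::size_t Align>
    struct ConditionalAllocate {
      static void* run(std::size_t bytes) { return aligned_alloc(Align, bytes); }
    };

    // Unusable alignment request: fall back to plain malloc.
    template <std::size_t Align>
    struct ConditionalAllocate<false, Align> {
      static void* run(std::size_t bytes) { return malloc(bytes); }
    };

    int main() {
      void* p = ConditionalAllocate<CheckAlign<64>::value, 64>::run(1024);
      std::printf("64-byte aligned: %d\n", int(reinterpret_cast<std::uintptr_t>(p) % 64 == 0));
      free(p);
    }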
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes)); + std::lock_guard<std::mutex> lock(mutex_); + auto buf = cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); buf.set_final_data(nullptr); - std::lock_guard<std::mutex> lock(mutex_); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf)); + buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >(static_cast<const uint8_t*>(ptr),buf)); return static_cast<void*>(ptr); } @@ -138,62 +180,113 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { std::lock_guard<std::mutex> lock(mutex_); buffer_map.clear(); } - - EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); + /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device + /// pointer created as a key we find the sycl buffer and get the host accessor with write mode + /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that + /// this buffer is accessed, the data will be copied to the device. + /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark + /// Also, this is faster as it uses the map_allocator instead of memcpy + template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + auto it =find_buffer(dst); + auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0)); + }); + synchronize(); } - - // This function checks if the runtime recorded an error for the - // underlying stream device. 
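The get_sycl_supported_devices() helper earlier in this file builds its list by iterating platforms, lower-casing the platform and vendor strings, and dropping combinations ComputeCpp cannot drive (AMD OpenCL CPU devices unless they are APUs, and Intel OpenCL GPU devices). A host-only sketch of that filter over hypothetical, flattened device records:

    #include <algorithm>
    #include <cctype>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Hypothetical flattened view of a SYCL device, enough for the filter logic.
    struct DeviceInfo {
      std::string platform;  // info::platform::name
      std::string vendor;    // info::device::vendor
      bool is_cpu;
      bool is_gpu;
    };

    static std::string lowered(std::string s) {
      std::transform(s.begin(), s.end(), s.begin(), ::tolower);
      return s;
    }

    // Mirrors the unsuported_condition in the hunk above.
    static bool unsupported(const DeviceInfo& d) {
      const std::string platform = lowered(d.platform);
      const std::string vendor = lowered(d.vendor);
      return (d.is_cpu && platform.find("amd") != std::string::npos
                       && vendor.find("apu") == std::string::npos)
          || (d.is_gpu && platform.find("intel") != std::string::npos);
    }

    int main() {
      std::vector<DeviceInfo> all = {
        {"AMD Accelerated Parallel Processing", "AuthenticAMD",         true,  false},
        {"Intel(R) OpenCL",                     "Intel(R) Corporation", false, true },
        {"Intel(R) OpenCL",                     "Intel(R) Corporation", true,  false},
      };
      for (const DeviceInfo& d : all)
        std::printf("%-40s %s\n", d.platform.c_str(), unsupported(d) ? "skipped" : "kept");
    }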
- EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - m_queue.wait_and_throw(); - } - return !exception_caught_; + /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl + /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the + /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination + /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data + /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back + /// to the cpu only once per function call. + template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + auto it =find_buffer(src); + auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); } - // destructor - ~QueueInterface() { buffer_map.clear(); } -}; + /// the memcpy function + template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + auto it1 = find_buffer(static_cast<const void*>(src)); + auto it2 = find_buffer(dst); + auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; + auto i= (static_cast<const uint8_t*>(dst)) - it2->first; + offset/=sizeof(Index); + i/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); + auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); + }); + synchronize(); + } -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. 
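Every data-movement routine in the new QueueInterface starts from a raw key pointer returned by allocate(), recovers the owning buffer from buffer_map (falling back to an interval search when the pointer lands inside an allocation), and converts the byte distance into an element offset with offset /= sizeof(Index). A host-only sketch of that bookkeeping, with a plain base-address-to-size map standing in for the map of SYCL buffers:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <map>
    #include <utility>

    // Stand-in for buffer_map: allocation base address -> size in bytes.
    typedef std::map<const std::uint8_t*, std::size_t> AllocMap;

    // Return the allocation containing `ptr` plus the byte offset inside it.
    static std::pair<const std::uint8_t*, std::ptrdiff_t>
    find_allocation(const AllocMap& allocs, const void* ptr) {
      const std::uint8_t* p = static_cast<const std::uint8_t*>(ptr);
      AllocMap::const_iterator it = allocs.find(p);
      if (it == allocs.end()) {
        for (AllocMap::const_iterator jt = allocs.begin(); jt != allocs.end(); ++jt)
          if (jt->first < p && p < jt->first + jt->second) { it = jt; break; }
      }
      if (it == allocs.end()) {  // the patch prints a diagnostic and aborts here
        std::fprintf(stderr, "pointer was not allocated through this interface\n");
        std::abort();
      }
      return std::make_pair(it->first, p - it->first);
    }

    int main() {
      static float storage[64];
      AllocMap allocs;
      allocs[reinterpret_cast<const std::uint8_t*>(storage)] = sizeof(storage);
      std::pair<const std::uint8_t*, std::ptrdiff_t> hit =
          find_allocation(allocs, storage + 10);  // interior pointer
      std::printf("byte offset: %td, element offset: %td\n",
                  hit.second, hit.second / (std::ptrdiff_t)sizeof(float));  // 40, 10
    }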
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + size_t rng, GRange, tileSize; + parallel_for_setup(n, tileSize, rng, GRange); + auto it1 = find_buffer(static_cast<const void*>(data)); + ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first; + m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); + synchronize(); + } /// Creation of sycl accessor for a buffer. This function first tries to find /// the buffer in the buffer_map. If found it gets the accessor from it, if not, /// the function then adds an entry by creating a sycl buffer for that particular pointer. template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); + return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { + return find_buffer(ptr)->second; + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first); + } + + EIGEN_STRONG_INLINE void synchronize() const { + m_queue.wait_and_throw(); //pass + } + + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled + m_queue.wait_and_throw(); //pass } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()); - auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); + tileSize =static_cast<Index>(m_queue.get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()); + auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); } rng = n; @@ -210,7 +303,7 @@ struct SyclDevice { template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); @@ -234,13 +327,11 @@ struct SyclDevice { } } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); } Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); @@ -273,6 +364,108 @@ struct SyclDevice { if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); } } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); + } + + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + m_queue.wait_and_throw(); + } + return !exception_caught_; + } + + // destructor + ~QueueInterface() { buffer_map.clear(); } + +private: + /// class members: + bool exception_caught_ = false; + + mutable std::mutex mutex_; + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. 
+ /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > > buffer_map; + /// sycl queue + mutable cl::sycl::queue m_queue; + + EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator find_buffer(const void* ptr) const { + std::lock_guard<std::mutex> lock(mutex_); + auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = it->second.get_size(); + if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; + } + } + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); + } +}; + +// Here is a sycl deviuce struct which accept the sycl queue interface +// as an input +struct SyclDevice { + // class member. + QueueInterface* m_queue_stream; + /// QueueInterface is not owned. it is the caller's responsibility to destroy it. + explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} + + // get sycl accessor + template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { + return m_queue_stream->template get_sycl_accessor<AcMd>(cgh, ptr); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1, SyclAllocator<uint8_t, EIGEN_MAX_ALIGN_BYTES> >& get_sycl_buffer(const void * ptr) const { + return m_queue_stream->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); + + } /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -287,78 +480,27 @@ struct SyclDevice { /// the memcpy function template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index 
*src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; - auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + m_queue_stream->memcpy(dst,src,n); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return m_queue_stream->get_offset(ptr); + + } +// memcpyHostToDevice template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); - ::memcpy(host_acc.get_pointer(), src, n); + m_queue_stream->memcpyHostToDevice(dst,src,n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
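Kernel launches in this file are sized by the parallel_for_setup helpers: the tile size comes from the device's max_work_group_size (clamped to 256 on CPU targets, as the comments above note), and the global range is padded up to a multiple of the tile so it fits a cl::sycl::nd_range; the kernels themselves then guard with globalid < rng. A simplified reconstruction of the 1-D case; the zero-size guard is an assumption, since that part of the hunk is elided above:

    #include <algorithm>
    #include <cstdio>

    typedef long Index;

    // tile  : work-group size, from info::device::max_work_group_size (<=256 on CPU)
    // range : the logical number of items the kernel must cover
    // global: range rounded up to the next multiple of tile
    static void parallel_for_setup_1d(Index n, Index max_wg, bool is_cpu,
                                      Index& tile, Index& range, Index& global) {
      tile = is_cpu ? std::min<Index>(Index(256), max_wg) : max_wg;
      range = (n == 0) ? Index(1) : n;     // assumed: never launch an empty range
      global = range;
      const Index rem = global % tile;
      if (rem != 0) global += tile - rem;  // pad to an exact multiple of tile
    }

    int main() {
      Index tile, range, global;
      parallel_for_setup_1d(1000, 256, /*is_cpu=*/true, tile, range, global);
      std::printf("tile=%ld range=%ld global=%ld\n", tile, range, global);  // 256 1000 1024
    }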
+/// here is the memcpyDeviceToHost template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); + m_queue_stream->memcpyDeviceToHost(dst,src,n); } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); - synchronize(); + m_queue_stream->memset(data,c,n); } - - struct memsetCghFunctor{ - cl::sycl::buffer<uint8_t, 1>& m_buf; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); - } - }; + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME @@ -367,39 +509,32 @@ struct SyclDevice { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. + // there is no l3 cache on sycl devices. return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); - // return stream_->deviceProperties().multiProcessorCount; + return m_queue_stream->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>(); - - // return stream_->deviceProperties().maxThreadsPerBlock; + return m_queue_stream->maxSyclThreadsPerBlock(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + return m_queue_stream->maxSyclThreadsPerMultiProcessor(); // return stream_->deviceProperties().maxThreadsPerMultiProcessor; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); - // return stream_->deviceProperties().sharedMemPerBlock; + return m_queue_stream->sharedMemPerBlock(); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass + m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled - sycl_queue().wait_and_throw(); //pass - + m_queue_stream->asynchronousExec(); } // This function checks if the runtime recorded an error for the // underlying stream device. @@ -407,8 +542,10 @@ struct SyclDevice { return m_queue_stream->ok(); } }; - - +// This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout. +// This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator +// inside the kenrel. So we can have two types of eval for host and device. This is required for TensorArgMax operation +struct SyclKernelDevice:DefaultDevice{}; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca69..ec6802e85 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -154,7 +154,11 @@ struct ThreadPoolDevice { template <class Function, class... Args> EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... 
args) const { - pool_->Schedule(std::bind(f, args...)); + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(f, args...)); + } else { + pool_->Schedule(f); + } } // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 82dd1e640..d0c027890 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -32,6 +32,7 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> > typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename MakePointer_<Scalar>::Type PointerType; enum { Flags = 0 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d6415817b..2264be391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -131,7 +131,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); @@ -193,7 +193,12 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifndef __SYCL_DEVICE_ONLY__ return loadConstant(m_data+index); +#else + CoeffReturnType tmp = m_data[index]; + return tmp; +#endif } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -278,7 +283,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> internal::unpacket_traits<PacketReturnType>::size); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } @@ -348,7 +353,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; } @@ -428,7 +433,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor @@ -528,7 +533,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() 
const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; } @@ -620,7 +625,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f01d77c0a..0ffe68ab3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> { }; -#if defined(__CUDACC__) +#if defined(EIGEN_CUDACC) template <typename Evaluator, typename Index, bool Vectorizable> struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( evaluator.cleanup(); } -#endif // __CUDACC__ +#endif // EIGEN_CUDACC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 85dfc7a69..4b6540c07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -38,7 +38,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; @@ -89,6 +89,7 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType; }; template<typename UnaryOp, typename XprType> @@ -161,7 +162,11 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion<Scalar, + typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val, + typename traits<LhsXprType>::PointerType, + typename traits<RhsXprType>::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -238,7 +243,11 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT typedef typename remove_reference<Arg3Nested>::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename TypeConversion<Scalar, + typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val, + typename traits<Arg2XprType>::PointerType, + typename traits<Arg3XprType>::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -314,6 +323,9 @@ struct 
traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > typedef typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val, + typename traits<ThenXprType>::PointerType, + typename traits<ElseXprType>::PointerType>::type PointerType; }; template<typename IfXprType, typename ThenXprType, typename ElseXprType> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index f060191ab..10e0a8a6b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -71,6 +71,7 @@ struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename traits<XprType>::PointerType PointerType; }; template <typename FFT, typename XprType, int FFTResultType, int FFTDirection> @@ -234,7 +235,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D if (line_len > 1) { const RealScalar pi_over_len(EIGEN_PI / line_len); const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); + std::cos(pi_over_len), std::sin(pi_over_len)); pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index fcee5f60d..e943757ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -20,7 +20,7 @@ namespace Eigen { * The fixed sized equivalent of * Eigen::Tensor<float, 3> t(3, 5, 7); * is - * Eigen::TensorFixedSize<float, Size<3,5,7>> t; + * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; */ template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index abe85c860..c015ce196 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -38,6 +38,7 @@ struct traits<TensorForcedEvalOp<XprType> > typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 @@ -143,7 +144,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 2e638992a..354bbe8d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -22,6 +22,22 @@ template<typename T> struct MakePointer { typedef T* Type; typedef T& RefType; }; + +namespace internal{ +template<typename A, typename B> struct Pointer_type_promotion { + static const bool val=false; +}; +template<typename A> struct Pointer_type_promotion<A, A> { + static const bool val = true; +}; +template<typename A, typename B> struct TypeConversion; +#ifndef __SYCL_DEVICE_ONLY__ +template<typename A, typename B> struct TypeConversion{ + typedef A* type; +}; +#endif +} + #if defined(EIGEN_USE_SYCL) namespace TensorSycl { namespace internal{ @@ -70,6 +86,7 @@ template<typename Strides, typename XprType> class TensorInflationOp; template<typename Generator, typename XprType> class TensorGeneratorOp; template<typename LeftXprType, typename RightXprType> class TensorAssignOp; template<typename Op, typename XprType> class TensorScanOp; +template<typename Dims, typename XprType> class TensorTraceOp; template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3b4f8eda1..5dcc3794c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -166,7 +166,8 @@ template <typename T> struct MeanReducer return pset1<Packet>(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; + internal::scalar_quotient_op<T> quotient_op; + return quotient_op(accum, T(scalarCount_)); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { @@ -175,7 +176,10 @@ template <typename T> struct MeanReducer template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { internal::scalar_sum_op<T> sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size); + internal::scalar_quotient_op<T> quotient_op; + return quotient_op( + sum_op(saccum, predux(vaccum)), + T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size)); } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index eb1d4934e..fa269b8c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -31,6 +31,7 @@ struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Generator, typename XprType> @@ -98,9 +99,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_generator(op.generator()) +#ifdef EIGEN_USE_SYCL + , m_argImpl(op.expression(), device) +#endif { - TensorEvaluator<ArgType, Device> impl(op.expression(), device); - m_dimensions = impl.dimensions(); + TensorEvaluator<ArgType, Device> argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast<int>(Layout) == 
static_cast<int>(ColMajor)) { m_strides[0] = 1; @@ -153,7 +157,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> TensorOpCost::MulCost<Scalar>()); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -178,6 +187,9 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> Dimensions m_dimensions; array<Index, NumDims> m_strides; Generator m_generator; +#ifdef EIGEN_USE_SYCL + TensorEvaluator<ArgType, Device> m_argImpl; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed2..3c6a2e091 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -27,6 +27,7 @@ namespace Eigen { * patch_cols, and 1 for all the additional dimensions. */ namespace internal { + template<DenseIndex Rows, DenseIndex Cols, typename XprType> struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> { @@ -38,6 +39,7 @@ struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<DenseIndex Rows, DenseIndex Cols, typename XprType> @@ -70,12 +72,12 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, DenseIndex row_strides, DenseIndex col_strides, @@ -84,13 +86,31 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, DenseIndex padding_right, Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - 
m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + +#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + bool padding_explicit, DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, PaddingType padding_type, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(padding_explicit), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(padding_type), m_padding_value(padding_value) {} + +#endif EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } @@ -171,8 +191,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + #ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) + #else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + #endif : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -241,6 +268,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warnig; + m_outputRows=0; //// silence the uninitialised warnig; } } eigen_assert(m_outputRows > 0); @@ -418,9 +447,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // Required by SYCL in order to construct 
the expression tree on the device + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -501,6 +535,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Scalar m_paddingValue; TensorEvaluator<ArgType, Device> m_impl; + #ifdef EIGEN_USE_SYCL + // Required for SYCL in order to construct the expression tree on the device + XprType m_op; + #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9ee..6147fbdf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -31,6 +31,7 @@ struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Strides, typename XprType> @@ -213,7 +214,12 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } +#endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ef1c9c42c..fb6454623 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -35,7 +35,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clz(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -53,7 +53,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clzll(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -90,7 +90,7 @@ namespace { template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umulhi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast<uint32_t>(b)); @@ -101,7 +101,7 @@ namespace { template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umul64hi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast<uint64_t>(b)); @@ -124,7 +124,7 @@ namespace { template <typename T> struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && 
!defined(__SYCL_DEVICE_ONLY__) return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return (__umulhi(magic, n) >> shift); #elif defined(__SYCL_DEVICE_ONLY__) return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index cd0109ef4..4e384f9b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -46,6 +46,7 @@ struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = traits<XprType>::NumDimensions; static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; }; template<typename XprType> @@ -159,7 +160,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_impl.data(); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index f92e39d69..c9e61f359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b5ef31d55..5431eb740 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) template <> struct PacketType<half, GpuDevice> { typedef half2 type; @@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { + #ifndef __SYCL_DEVICE_ONLY__ if (&rhs == this) return *this; + #endif first = rhs.first; second = rhs.second; return *this; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddd2ca18..329655817 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -31,6 +31,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = array_size<NewDimensions>::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; 
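A recurring change across the evaluator files in this patch is that data() now returns a PointerType trait instead of a raw CoeffReturnType*, and for binary and ternary expressions that trait is selected with the Pointer_type_promotion helper declared in TensorForwardDeclarations.h. The following reduced model shows the selection mechanism; it is simplified in that it compares the two operand scalars directly, whereas the patch compares each operand's scalar against the expression's result scalar:

    #include <cstdio>
    #include <type_traits>

    // Promotion is only considered safe when the two scalar types are identical.
    template <typename A, typename B> struct PointerTypePromotion { static const bool val = false; };
    template <typename A>             struct PointerTypePromotion<A, A> { static const bool val = true; };

    // A binary expression exposes the left operand's pointer type when its scalar
    // matches, and falls back to the right operand's pointer type otherwise.
    template <typename LhsScalar, typename RhsScalar>
    struct BinaryExprTraits {
      typedef typename std::conditional<PointerTypePromotion<LhsScalar, RhsScalar>::val,
                                        LhsScalar*, RhsScalar*>::type PointerType;
    };

    int main() {
      static_assert(std::is_same<BinaryExprTraits<float, float>::PointerType, float*>::value,
                    "matching scalars keep the left-hand pointer type");
      static_assert(std::is_same<BinaryExprTraits<int, double>::PointerType, double*>::value,
                    "mismatched scalars fall back to the right-hand pointer type");
      std::puts("PointerType selection behaves as described");
    }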
template<typename NewDimensions, typename XprType> @@ -146,7 +147,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -214,6 +215,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = array_size<StartIndices>::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename StartIndices, typename Sizes, typename XprType> @@ -468,7 +470,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { Scalar* result = m_impl.data(); if (result) { Index offset = 0; @@ -633,6 +635,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = array_size<StartIndices>::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> @@ -823,7 +826,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a8e255246..5956e513d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -31,6 +31,7 @@ struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprT typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename PaddingDimensions, typename XprType> @@ -198,7 +199,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return cost; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// used by sycl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f6..9e0a20abf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -31,6 +31,7 @@ struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 
1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename PatchDim, typename XprType> @@ -100,6 +101,9 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_patch_dims(op.patch_dims()) +#endif { Index num_patches = 1; const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -253,7 +257,12 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } +#endif protected: Dimensions m_dimensions; @@ -262,6 +271,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> array<Index, NumDims-1> m_patchStrides; TensorEvaluator<ArgType, Device> m_impl; + +#ifdef EIGEN_USE_SYCL + const PatchDim m_patch_dims; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 1655a813e..230915db2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -16,7 +16,7 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH // We don't support 3d kernels since we currently only use 1 and // 2d kernels. assert(threadIdx.z == 0); @@ -55,11 +55,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. 
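The signature change just above gives the PCG-XSH-RS step an explicit stream id: the stream (forced odd via `(stream << 1) | 1`) is used as the LCG increment, so the generator classes further below can pass the coefficient index as the stream and draw values for different indices from distinct PCG sequences instead of advancing one shared sequence. The following is a minimal standalone sketch of that scheme, not Eigen's code; the function name `pcg_xsh_rs` and the seed value are illustrative only.

    #include <cstdint>
    #include <iostream>

    // Illustrative PCG-XSH-RS step: the increment is derived from the stream id
    // (forced odd), so two streams seeded with the same state diverge after the
    // first update.
    static unsigned pcg_xsh_rs(uint64_t* state, uint64_t stream) {
      const uint64_t current = *state;
      *state = current * 6364136223846793005ULL + ((stream << 1) | 1);  // LCG step with a stream-specific increment
      // XSH-RS output permutation of the pre-update state.
      return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
    }

    int main() {
      uint64_t s0 = 12345, s1 = 12345;          // same seed, streams 0 and 1
      pcg_xsh_rs(&s0, 0); pcg_xsh_rs(&s1, 1);   // first outputs match; the states now differ
      std::cout << pcg_xsh_rs(&s0, 0) << " vs " << pcg_xsh_rs(&s1, 1) << "\n";  // later draws differ
    }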
uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + *state = current * 6364136223846793005ULL + (stream << 1 | 1); // Generate the random output (using the PCG-XSH-RS scheme) return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -73,17 +73,17 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); +T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); return static_cast<T>(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { +Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { Eigen::half result; // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); + unsigned rnd = PCG_XSH_RS_generator(state, stream); result.x = static_cast<uint16_t>(rnd & 0x3ffu); // Set the exponent result.x |= (static_cast<uint16_t>(15) << 10); @@ -93,14 +93,14 @@ Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform<float>(uint64_t* state) { +float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); + const unsigned rnd = PCG_XSH_RS_generator(state, stream); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast<uint32_t>(127) << 23); @@ -109,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform<double>(uint64_t* state) { +double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { typedef union { uint64_t raw; double dp; @@ -118,9 +118,9 @@ double RandomToTypeUniform<double>(uint64_t* state) { result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast<uint64_t>(1023) << 52); @@ -129,14 +129,14 @@ double RandomToTypeUniform<double>(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) { - return std::complex<float>(RandomToTypeUniform<float>(state), - RandomToTypeUniform<float>(state)); +std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) { + return std::complex<float>(RandomToTypeUniform<float>(state, stream), + RandomToTypeUniform<float>(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) { - return std::complex<double>(RandomToTypeUniform<double>(state), - RandomToTypeUniform<double>(state)); +std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) { + return 
std::complex<double>(RandomToTypeUniform<double>(state, stream), + RandomToTypeUniform<double>(state, stream)); } template <typename T> class UniformRandomGenerator { @@ -155,9 +155,7 @@ template <typename T> class UniformRandomGenerator { template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform<T>(&local_state); - m_state = local_state; + T result = RandomToTypeUniform<T>(&m_state, i); return result; } @@ -165,11 +163,9 @@ template <typename T> class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform<T>(&local_state); + values[j] = RandomToTypeUniform<T>(&m_state, i); } - m_state = local_state; return internal::pload<Packet>(values); } @@ -190,14 +186,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > { template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { +T RandomToTypeNormal(uint64_t* state, uint64_t stream) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. T u, v, q; do { - u = RandomToTypeUniform<T>(state); - v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5)); + u = RandomToTypeUniform<T>(state, stream); + v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -208,14 +204,14 @@ T RandomToTypeNormal(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) { - return std::complex<float>(RandomToTypeNormal<float>(state), - RandomToTypeNormal<float>(state)); +std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) { + return std::complex<float>(RandomToTypeNormal<float>(state, stream), + RandomToTypeNormal<float>(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) { - return std::complex<double>(RandomToTypeNormal<double>(state), - RandomToTypeNormal<double>(state)); +std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) { + return std::complex<double>(RandomToTypeNormal<double>(state, stream), + RandomToTypeNormal<double>(state, stream)); } @@ -234,9 +230,7 @@ template <typename T> class NormalRandomGenerator { template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal<T>(&local_state); - m_state = local_state; + T result = RandomToTypeNormal<T>(&m_state, i); return result; } @@ -244,11 +238,9 @@ template <typename T> class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal<T>(&local_state); + values[j] = RandomToTypeNormal<T>(&m_state, i); } - m_state = local_state; return internal::pload<Packet>(values); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e9b..da0ffe728 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -44,6 +44,7 @@ namespace internal { typedef typename XprType::Nested Nested; static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; template <class T> struct MakePointer { // Intermediate typedef to workaround MSVC issue. @@ -333,7 +334,7 @@ struct OuterReducer { }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template <int B, int N, typename S, typename R, typename I> __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); @@ -421,7 +422,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, static const bool RunningFullReduction = (NumOutputDims==0); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) +#if defined(EIGEN_USE_SYCL) + , m_xpr_dims(op.dims()) +#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -674,14 +678,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, } } - EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} + EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; } +#if defined(EIGEN_USE_SYCL) + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + const Device& device() const { return m_device; } + const Dims& xprDims() const { return m_xpr_dims; } +#endif private: template <int, typename, typename> friend struct internal::GenericDimReducer; @@ -691,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, #ifdef EIGEN_USE_THREADS template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); @@ -778,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) static const bool RunningOnGPU = 
internal::is_same<Device, Eigen::GpuDevice>::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) @@ -791,7 +794,10 @@ static const bool RunningOnGPU = false; typename MakePointer_<CoeffReturnType>::Type m_result; const Device& m_device; - const Dims& m_xpr_dims; + +#if defined(EIGEN_USE_SYCL) + const Dims m_xpr_dims; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index edb0ab280..ebcbd6f41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // Full reducers for GPU, don't vectorize for now // Reducer function that enables multiple cuda thread to safely accumulate at the same @@ -23,7 +23,7 @@ namespace internal { // updated the content of the output address it will try again. template <typename T, typename R> __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 if (sizeof(T) == 4) { unsigned int oldval = *reinterpret_cast<unsigned int*>(output); @@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) else { assert(0 && "Wordsize not supported"); } -#else +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // EIGEN_CUDA_ARCH >= 300 } // We extend atomicExch to support extra data types @@ -98,15 +98,15 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer } } } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); -#else +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output, unsigned int* semaphore) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; if (gridDim.x == 1) { @@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num // Let the last block reset the semaphore atomicInc(semaphore, gridDim.x + 1); } -#else +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; // Initialize the 
output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - if (num_coeffs % 2 != 0) { - half last = input.m_impl.coeff(num_coeffs-1); - *scratch = __halves2half2(last, reducer.initialize()); - } else { - *scratch = reducer.template initializePacket<half2>(); + if (gridDim.x == 1) { + if (first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } } __syncthreads(); } @@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); + #else + int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(scratch, accum, reducer); } - __syncthreads(); - - if (gridDim.x == 1 && first_index == 0) { - half tmp = __low2half(*scratch); - reducer.reduce(__high2half(*scratch), &tmp); - *output = tmp; + if (gridDim.x == 1) { + __syncthreads(); + if (first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } } } @@ -268,7 +280,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2 *output = tmp; } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct FullReductionLauncher { @@ -335,7 +347,7 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> { } } }; -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename Self, typename Op, bool Vectorizable> @@ -348,11 +360,11 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { (internal::is_same<typename Self::CoeffReturnType, float>::value || internal::is_same<typename Self::CoeffReturnType, double>::value || (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); -#else +#else // EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || internal::is_same<typename Self::CoeffReturnType, double>::value); -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename OutputType> static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { @@ -372,7 +384,7 @@ template <int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); @@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { 
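The recurring change in these CUDA kernels swaps the warp shuffle intrinsic `__shfl_down` for `__shfl_down_sync` with a full-warp mask, keeping the old call only when `EIGEN_CUDACC_VER` is defined and below 90000; from CUDA 9 onwards the unsynchronized variant is deprecated. Below is a small self-contained CUDA C++ sketch of the same warp-level tree reduction; it is illustrative only, not Eigen's kernel, and it guards on the standard `__CUDACC_VER_MAJOR__` macro rather than Eigen's wrapper.

    #include <cstdio>
    #include <cuda_runtime.h>

    __device__ float warp_sum(float val) {
      // Shuffle-based tree reduction within a single warp.
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
        val += __shfl_down_sync(0xFFFFFFFFu, val, offset, warpSize);  // CUDA 9+: explicit full-warp mask
    #else
        val += __shfl_down(val, offset, warpSize);                    // deprecated from CUDA 9 onwards
    #endif
      }
      return val;  // lane 0 ends up holding the warp total
    }

    __global__ void reduce_one_warp(const float* in, float* out) {
      float total = warp_sum(in[threadIdx.x]);
      if ((threadIdx.x & (warpSize - 1)) == 0) *out = total;
    }

    int main() {
      const int n = 32;                         // launch exactly one warp
      float h_in[n], h_out = 0.f;
      for (int i = 0; i < n; ++i) h_in[i] = 1.f;
      float *d_in = 0, *d_out = 0;
      cudaMalloc(&d_in, n * sizeof(float));
      cudaMalloc(&d_out, sizeof(float));
      cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
      reduce_one_warp<<<1, n>>>(d_in, d_out);
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      printf("sum = %f\n", h_out);              // expected: 32
      cudaFree(d_in); cudaFree(d_out);
      return 0;
    }

The 0xFFFFFFFF mask states that every lane of the warp participates, which matches the implicit full-warp behaviour of the old intrinsic.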
@@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } -#else +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // EIGEN_CUDA_ARCH >= 300 } #ifdef EIGEN_HAS_CUDA_FP16 @@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + #else + int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize); + int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp1), &reduced_val1); + reducer.reducePacket(*(half2*)(&temp2), &reduced_val2); + #endif } half val1 = __low2half(reduced_val1); @@ -533,7 +556,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, } } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct InnerReductionLauncher { @@ -625,7 +648,7 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> { return false; } }; -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename Self, typename Op> @@ -638,11 +661,11 @@ struct InnerReducer<Self, Op, GpuDevice> { (internal::is_same<typename Self::CoeffReturnType, float>::value || internal::is_same<typename Self::CoeffReturnType, double>::value || (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); -#else +#else // EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || internal::is_same<typename Self::CoeffReturnType, double>::value); -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <typename OutputType> static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { @@ -740,7 +763,7 @@ struct OuterReducer<Self, Op, GpuDevice> { } }; -#endif +#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__) } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129e2..94899252b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,15 +27,15 @@ namespace internal { template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a 
second to local memory, used to store intermediate data. */ auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h); - auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h); + auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h); typedef decltype(aI) InputAccessor; typedef decltype(aOut) OutputAccessor; typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor; @@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ template<typename BufferTOut, typename BufferTIn> -static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), - bufOut, bufI, dev, length, local); + bufOut, out_offset, bufI, dev, length, local); } }; @@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); + ptrdiff_t out_offset = dev.get_offset(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -157,11 +158,12 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); + auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output); + ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? 
num_values_to_reduce : static_cast<Index>(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index e430b0826..14a50a029 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -31,6 +31,7 @@ struct traits<TensorReverseOp<ReverseDimensions, typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename ReverseDimensions, typename XprType> @@ -222,7 +223,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 8501466ce..1f545ef1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -24,6 +24,7 @@ struct traits<TensorScanOp<Op, XprType> > typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Op, typename XprType> @@ -175,7 +176,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_output; } @@ -241,7 +242,7 @@ struct ScanLauncher { } }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // GPU implementation of scan // TODO(ibab) This placeholder implementation performs multiple scans in @@ -280,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> { LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); } }; -#endif // EIGEN_USE_GPU && __CUDACC__ +#endif // EIGEN_USE_GPU && EIGEN_CUDACC } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index edc9dd3f3..0697fd1ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -31,6 +31,7 @@ struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = 
XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Shuffle, typename XprType> @@ -185,7 +186,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } // required by sycl EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 2237140e7..a7eea99b6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -31,6 +31,7 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template<typename Strides, typename XprType> @@ -222,7 +223,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4c1..7b8bd2df7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -32,12 +32,28 @@ struct MakeLocalPointer { namespace Eigen { -namespace TensorSycl { + template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp; + template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>; namespace internal { - template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; +#ifdef __SYCL_DEVICE_ONLY__ +template<typename A, typename B> struct TypeConversion { + template<typename T> + static typename MakeGlobalPointer<A>::Type get_address_space_pointer(typename MakeGlobalPointer<T>::Type p); + template<typename T> + static typename MakeLocalPointer<A>::Type get_address_space_pointer(typename MakeLocalPointer<T>::Type p); + + template<typename T> + static A* get_address_space_pointer(T* p); + typedef decltype(get_address_space_pointer(B())) type; +}; +#endif +} +namespace TensorSycl { +namespace internal { + template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; /// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
struct NoOP; @@ -48,6 +64,13 @@ template<typename T> struct GetType<false, T>{ typedef T Type; }; +template <bool Conds, size_t X , size_t Y > struct ValueCondition { + static constexpr size_t Res =X; +}; +template<size_t X, size_t Y> struct ValueCondition<false, X, Y> { + static constexpr size_t Res =Y; +}; + } } } @@ -80,6 +103,9 @@ template<typename T> struct GetType<false, T>{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +// TensorArgMaxSycl.h +#include "TensorArgMaxSycl.h" + /// this is used for extracting tensor convolution #include "TensorConvolutionSycl.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9c2..d6ac7b91f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -91,27 +91,37 @@ ASSIGNCONVERT(, false) #undef ASSIGNCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is either TensorForcedEvalOp or TensorEvalToOp +/// type is TensorEvalToOp #define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ + +KERNELBROKERCONVERT(const, true, TensorEvalToOp) +KERNELBROKERCONVERT(, false, TensorEvalToOp) +#undef KERNELBROKERCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ template <typename Expr>\ -struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\ +struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\ + typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\ }; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL +// TensorForcedEvalOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT +// TensorLayoutSwapOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) + +//TensorIndexTupleOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define KERNELBROKERCONVERTREDUCTION(CVQual)\ @@ -124,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ +template <typename OP, typename Dim, typename subExpr>\ +struct ConvertToDeviceExpression<CVQual 
TensorTupleReducerOp<OP, Dim, subExpr> > {\ + typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\ +}; + +KERNELBROKERCONVERTTUPLEREDUCTION(const) +KERNELBROKERCONVERTTUPLEREDUCTION() +#undef KERNELBROKERCONVERTTUPLEREDUCTION + +//TensorSlicingOp #define KERNELBROKERCONVERTSLICEOP(CVQual)\ template<typename StartIndices, typename Sizes, typename XprType>\ struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\ @@ -134,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const) KERNELBROKERCONVERTSLICEOP() #undef KERNELBROKERCONVERTSLICEOP - +//TensorStridingSlicingOp #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\ @@ -145,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const) KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ template <DenseIndex DimId, typename Expr>\ @@ -156,7 +177,26 @@ KERNELBROKERCONVERTCHIPPINGOP(const) KERNELBROKERCONVERTCHIPPINGOP() #undef KERNELBROKERCONVERTCHIPPINGOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp +#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ +}; +KERNELBROKERCONVERTIMAGEPATCHOP(const) +KERNELBROKERCONVERTIMAGEPATCHOP() +#undef KERNELBROKERCONVERTIMAGEPATCHOP + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp +#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct ConvertToDeviceExpression<CVQual TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\ + typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\ +}; +KERNELBROKERCONVERTVOLUMEPATCHOP(const) +KERNELBROKERCONVERTVOLUMEPATCHOP() +#undef KERNELBROKERCONVERTVOLUMEPATCHOP } // namespace internal } // namespace TensorSycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d2c..cbae4ea1d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -65,7 +65,6 @@ CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP @@ -83,6 +82,7 @@ CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Option ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\ : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\ }; + TENSORMAPFIXEDSIZE(const) TENSORMAPFIXEDSIZE() #undef TENSORMAPFIXEDSIZE @@ -189,9 +189,6 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual ASSIGN() #undef 
ASSIGN - - - /// specialisation of the \ref ExprConstructor struct when the node type is /// const TensorAssignOp #define CONVERSIONEXPRCONST(CVQual)\ @@ -223,7 +220,7 @@ struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQua Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ + : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ }; EVALTO(const) @@ -236,8 +233,12 @@ EVALTO() template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ - TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ + typedef TensorForcedEvalOp<OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ @@ -248,19 +249,32 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL -template <bool Conds, size_t X , size_t Y > struct ValueCondition { - static const size_t Res =X; -}; -template<size_t X, size_t Y> struct ValueCondition<false, X , Y> { - static const size_t Res =Y; +#define TENSORCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ +struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\ +CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\ + typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\ + typedef CVQual TensorMap<\ + Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\ + Eigen::internal::traits<XprType>::Layout, \ + MakeGlobalPointer\ + > Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ }; +TENSORCUSTOMUNARYOP(const) +TENSORCUSTOMUNARYOP() +#undef TENSORCUSTOMUNARYOP + /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp #define SYCLREDUCTIONEXPR(CVQual)\ template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\ CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\ - static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ + static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\ NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ Type expr;\ @@ -273,32 +287,67 @@ SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp +/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP. +#define SYCLTUPLEREDUCTIONEXPR(CVQual)\ +template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ +struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\ +CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\ + static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\ + static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\ +static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\ + typedef CVQual TensorMap<\ + Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\ + Layout,\ + MakeGlobalPointer\ + > XprType;\ + typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\ + static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\ + typedef array<Index, NumDims> StrideDims;\ + typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\ + fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\ + }\ +}; + +SYCLTUPLEREDUCTIONEXPR(const) +SYCLTUPLEREDUCTIONEXPR() +#undef SYCLTUPLEREDUCTIONEXPR /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp +#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\ template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... 
Params>\ struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ - typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ - NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\ - typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\ - Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ + typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\ + static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\ + typedef CVQual TensorMap<\ + Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\ + Eigen::internal::traits<XprTyp>::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ }; -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - - - +//TensorContractionOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +//TensorConvolutionOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +//TensorCustomBinaryOp +SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) +#undef SYCLCONTRACTCONVCUSBIOPS + +//TensorSlicingOp #define SYCLSLICEOPEXPR(CVQual)\ template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\ @@ -315,7 +364,7 @@ SYCLSLICEOPEXPR(const) SYCLSLICEOPEXPR() #undef SYCLSLICEOPEXPR - +//TensorStridingSlicingOp #define SYCLSLICESTRIDEOPEXPR(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\ @@ -332,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const) SYCLSLICESTRIDEOPEXPR() #undef SYCLSLICESTRIDEOPEXPR +//TensorReshapingOp and TensorShufflingOp #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... 
>{\ @@ -344,13 +394,15 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\ }; +// TensorReshapingOp SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const) SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, ) - +// TensorShufflingOp SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const) SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, ) #undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST +//TensorPaddingOp #define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\ template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ @@ -363,11 +415,11 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ }; +//TensorPaddingOp SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) #undef SYCLPADDINGOPEXPRCONST - // TensorChippingOp #define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ @@ -385,6 +437,67 @@ SYCLTENSORCHIPPINGOPEXPR(const) SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR +// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \ + funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\ +}; + +SYCLTENSORIMAGEPATCHOPEXPR(const) +SYCLTENSORIMAGEPATCHOPEXPR() +#undef SYCLTENSORIMAGEPATCHOPEXPR + +// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... 
> {\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \ + funcD.m_padding_type, funcD.m_padding_value ){\ + }\ +}; + +SYCLTENSORVOLUMEPATCHOPEXPR(const) +SYCLTENSORVOLUMEPATCHOPEXPR() +#undef SYCLTENSORVOLUMEPATCHOPEXPR + +// TensorLayoutSwapOp and TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\ +template<typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ +}; + +//TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp) + +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR /// template deduction for \ref ExprConstructor struct template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... 
Params> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43f6..fb95af59e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -147,6 +147,30 @@ SYCLFORCEDEVALEXTACC(const) SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC +//TensorCustomUnaryOp +#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, typename Dev >\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + + +SYCLCUSTOMUNARYOPEXTACC(const) +SYCLCUSTOMUNARYOPEXTACC() +#undef SYCLCUSTOMUNARYOPEXTACC + +//TensorCustomBinaryOp +#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + +SYCLCUSTOMBINARYOPEXTACC(const) +SYCLCUSTOMBINARYOPEXTACC() +#undef SYCLCUSTOMBIBARYOPEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ @@ -161,15 +185,19 @@ SYCLEVALTOEXTACC() #undef SYCLEVALTOEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXTACC(CVQual)\ +#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\ template <typename OP, typename Dim, typename Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\ +struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; +// TensorReductionOp +SYCLREDUCTIONEXTACC(const,TensorReductionOp) +SYCLREDUCTIONEXTACC(,TensorReductionOp) -SYCLREDUCTIONEXTACC(const) -SYCLREDUCTIONEXTACC() +// TensorTupleReducerOp +SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp) +SYCLREDUCTIONEXTACC(,TensorTupleReducerOp) #undef SYCLREDUCTIONEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp @@ -179,14 +207,14 @@ template<typename Indices, typename LhsXprType, typename RhsXprType, typename De static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; - +//TensorContractionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +//TensorConvolutionOp 
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) #undef SYCLCONTRACTIONCONVOLUTIONEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. #define SYCLSLICEOPEXTACC(CVQual)\ @@ -225,6 +253,49 @@ SYCLTENSORCHIPPINGOPEXTACC(const) SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorImagePatchOp. +#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORIMAGEPATCHOPEXTACC(const) +SYCLTENSORIMAGEPATCHOPEXTACC() +#undef SYCLTENSORIMAGEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorVolumePatchOp. +#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORVOLUMEPATCHOPEXTACC(const) +SYCLTENSORVOLUMEPATCHOPEXTACC() +#undef SYCLTENSORVOLUMEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorLayoutSwapOp, TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\ +template<typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +// TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp) + +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC /// template deduction for \ref ExtractAccessor template <typename Evaluator> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index ee020184b..a7905706d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -33,15 +33,17 @@ namespace internal { /// re-instantiate them on the device. /// We have to pass instantiated functors to the device. // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval). 
-template <typename Evaluator> struct FunctorExtractor{ - typedef typename Evaluator::Dimensions Dimensions; - const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - FunctorExtractor(const Evaluator& expr) - : m_dimensions(expr.dimensions()) {} +#define DEFALTACTION(Evaluator)\ +typedef typename Evaluator::Dimensions Dimensions;\ +const Dimensions m_dimensions;\ +EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ +FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {} +template <typename Evaluator> struct FunctorExtractor{ + DEFALTACTION(Evaluator) }; + /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything ///TensorConversionOp #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ @@ -113,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const) SYCLEXTRFUNCTERNARY() #undef SYCLEXTRFUNCTERNARY + + +//TensorCustomOp must be specialised otherwise it will be captured by UnaryCategory while its action is different +//from the UnaryCategory and it is similar to the general FunctorExtractor. +/// specialisation of TensorCustomOp +#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\ +template <typename CustomUnaryFunc, typename ArgType, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; +//TensorCustomUnaryOp +SYCLEXTRFUNCCUSTOMUNARYOP(const) +SYCLEXTRFUNCCUSTOMUNARYOP() +#undef SYCLEXTRFUNCCUSTOMUNARYOP + +//TensorCustomBinaryOp +#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ +template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\ + typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; +//TensorCustomBinaryOp +SYCLEXTRFUNCCUSTOMBIBARYOP(const) +SYCLEXTRFUNCCUSTOMBIBARYOP() +#undef SYCLEXTRFUNCCUSTOMBIBARYOP + + + /// specialisation of the \ref FunctorExtractor struct when the node type is /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. #define SYCLEXTRFUNCSELECTOP(CVQual)\ @@ -143,19 +175,26 @@ SYCLEXTRFUNCASSIGNOP(const) SYCLEXTRFUNCASSIGNOP() #undef SYCLEXTRFUNCASSIGNOP -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorEvalToOp, This is an specialisation without OP so it has to be separated. -#define SYCLEXTRFUNCEVALTOOP(CVQual)\ -template <typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\ - : rhsExpr(expr.impl()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node types are +/// TensorEvalToOp, TensorLayoutSwapOp and TensorIndexTupleOp. This is a specialisation without OP so it has to be separated.
+#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\ +template <typename Expr, typename Dev>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\ + FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\ + : xprExpr(expr.impl()) {}\ }; - -SYCLEXTRFUNCEVALTOOP(const) -SYCLEXTRFUNCEVALTOOP() -#undef SYCLEXTRFUNCEVALTOOP +//TensorEvalToOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp) +// TensorLayoutSwapOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp) +// TensorIndexTupleOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp) + +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE template<typename Dim, size_t NumOutputDim> struct DimConstr { template<typename InDim> @@ -166,10 +205,10 @@ template<typename Dim> struct DimConstr<Dim, 0> { template<typename InDim> static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));} }; - +//TensorReductionOp #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\ typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ @@ -177,12 +216,39 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgTy FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\ }; - - SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +//TensorTupleReducerOp +#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\ +template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\ + struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\ + typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\ + static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\ + typedef typename Evaluator::StrideDims StrideDims;\ + typedef typename Evaluator::Index Index;\ + typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ + const Dimensions m_dimensions;\ + const Index m_return_dim;\ + const StrideDims m_strides;\ + const Index m_stride_mod;\ + const Index m_stride_div;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index 
stride_div() const {return m_stride_div;}\ + FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\ + : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ + m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ +}; + +SYCLEXTRFUNCTUPLEREDUCTIONOP(const) +SYCLEXTRFUNCTUPLEREDUCTIONOP() +#undef SYCLEXTRFUNCTUPLEREDUCTIONOP + +//TensorContractionOp and TensorConvolutionOp #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ @@ -194,9 +260,10 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, Rhs : m_dimensions(expr.dimensions()) {}\ }; - +//TensorContractionOp SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) +//TensorConvolutionOp SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) #undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP @@ -219,6 +286,7 @@ SYCLEXTRFUNCTSLICEOP(const) SYCLEXTRFUNCTSLICEOP() #undef SYCLEXTRFUNCTSLICEOP +//TensorStridingSlicingOp #define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ @@ -237,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const) SYCLEXTRFUNCTSLICESTRIDEOP() #undef SYCLEXTRFUNCTSLICESTRIDEOP -// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory +// Had to separate TensorReshapingOp and TensorShufflingOp. 
Otherwise it will be mistaken by UnaryCategory #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ template<typename Param, typename XprType, typename Dev>\ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\ @@ -248,9 +316,11 @@ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprTy : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\ }; +//TensorReshapingOp SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), ) +//TensorShufflingOp SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), ) #undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT @@ -293,7 +363,7 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) //TensorChippingOp #define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ template<DenseIndex DimId, typename XprType, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\ FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ const DenseIndex m_dim;\ const DenseIndex m_offset;\ @@ -307,6 +377,84 @@ SYCLEXTRFUNCCHIPPINGOP(const) SYCLEXTRFUNCCHIPPINGOP() #undef SYCLEXTRFUNCCHIPPINGOP +//TensorImagePatchOp +#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\ +typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\ +FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\ +: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\ + m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\ + m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\ + m_padding_value(expr.xpr().padding_value()){}\ +}; + +SYCLEXTRFUNCIMAGEPATCHOP(const) +SYCLEXTRFUNCIMAGEPATCHOP() +#undef SYCLEXTRFUNCIMAGEPATCHOP + +/// TensorVolumePatchOp +#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\ +typedef CVQual TensorVolumePatchOp<Planes, 
Rows, Cols, XprType> Self;\ +FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ +const DenseIndex m_patch_planes;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_plane_strides;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_plane_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_plane_inflate_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top_z;\ +const DenseIndex m_padding_bottom_z;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\ +: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\ + m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\ + m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \ + m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\ +}; +SYCLEXTRFUNCVOLUMEPATCHOP(const) +SYCLEXTRFUNCVOLUMEPATCHOP() +#undef SYCLEXTRFUNCVOLUMEPATCHOP + + /// template deduction function for FunctorExtractor template <typename Evaluator> auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 2f7779036..e5b892f2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -21,11 +21,12 @@ namespace internal { template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ OP op; OutputAccessor aOut; + ptrdiff_t out_offset; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -59,7 +60,7 @@ namespace internal { aI[itemID.get_group(0)] = scratch[localid]; if((length<=local) && globalid ==0){ 
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; + aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0]; } } } @@ -71,9 +72,9 @@ namespace internal { template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor; + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -84,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. 
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast<Index>(itemID.get_global_linear_id()); @@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum; + output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -109,11 +111,11 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> { public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor; typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; @@ -124,8 +126,8 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. 
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast<Index>(itemID.get_global_linear_id()); @@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; + output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -170,7 +173,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); @@ -217,7 +220,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. 
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); auto scale = (rng*red_factor) + remaining; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index a1c112f4d..234580c7c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -93,26 +93,58 @@ SYCLFORCEDEVALLEAFCOUNT(const) SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT +#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\ +template <typename CustomUnaryFunc, typename XprType>\ +struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\ +static const size_t Count =1;\ +}; + +SYCLCUSTOMUNARYOPLEAFCOUNT(const) +SYCLCUSTOMUNARYOPLEAFCOUNT() +#undef SYCLCUSTOMUNARYOPLEAFCOUNT + + +#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\ +struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\ +static const size_t Count =1;\ +}; +SYCLCUSTOMBINARYOPLEAFCOUNT( const) +SYCLCUSTOMBINARYOPLEAFCOUNT() +#undef SYCLCUSTOMBINARYOPLEAFCOUNT + /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLEAFCOUNT(CVQual)\ +#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\ template <typename Expr>\ -struct LeafCount<CVQual TensorEvalToOp<Expr> > {\ - static const size_t Count = 1 + CategoryCount<Expr>::Count;\ +struct LeafCount<CVQual ExprNode<Expr> > {\ + static const size_t Count = Num + CategoryCount<Expr>::Count;\ }; -EVALTOLEAFCOUNT(const) -EVALTOLEAFCOUNT() -#undef EVALTOLEAFCOUNT +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0) + +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0) + +#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp -#define REDUCTIONLEAFCOUNT(CVQual)\ +#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\ template <typename OP, typename Dim, typename Expr>\ -struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\ +struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\ static const size_t Count =1;\ }; -REDUCTIONLEAFCOUNT(const) -REDUCTIONLEAFCOUNT() +// TensorReductionOp +REDUCTIONLEAFCOUNT(const,TensorReductionOp) +REDUCTIONLEAFCOUNT(,TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp) +REDUCTIONLEAFCOUNT(, TensorTupleReducerOp) + #undef REDUCTIONLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp @@ -128,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) #undef CONTRACTIONCONVOLUTIONLEAFCOUNT - - /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define 
SLICEOPLEAFCOUNT(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType>\ @@ -139,7 +169,6 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT - /// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp #define CHIPPINGOPLEAFCOUNT(CVQual)\ template <DenseIndex DimId, typename XprType>\ @@ -149,7 +178,7 @@ CHIPPINGOPLEAFCOUNT(const) CHIPPINGOPLEAFCOUNT() #undef CHIPPINGOPLEAFCOUNT - +///TensorStridingSlicingOp #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{}; @@ -158,6 +187,24 @@ SLICESTRIDEOPLEAFCOUNT(const) SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT +//TensorImagePatchOp +#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{}; + + +TENSORIMAGEPATCHOPLEAFCOUNT(const) +TENSORIMAGEPATCHOPLEAFCOUNT() +#undef TENSORIMAGEPATCHOPLEAFCOUNT + +// TensorVolumePatchOp +#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ +struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{}; + +TENSORVOLUMEPATCHOPLEAFCOUNT(const) +TENSORVOLUMEPATCHOPLEAFCOUNT() +#undef TENSORVOLUMEPATCHOPLEAFCOUNT } /// namespace TensorSycl } /// namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 74566dcee..9d5708fc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -143,17 +143,52 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorCustomUnaryOp +#define CUSTOMUNARYOPEVAL(CVQual)\ +template <typename CustomUnaryFunc, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\ +}; + +CUSTOMUNARYOPEVAL(const) +CUSTOMUNARYOPEVAL() +#undef CUSTOMUNARYOPEVAL + + /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp -#define EVALTO(CVQual)\ +/// TensorCustomBinaryOp +#define CUSTOMBINARYOPEVAL(CVQual)\ +template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\ +}; + +CUSTOMBINARYOPEVAL(const) +CUSTOMBINARYOPEVAL() +#undef CUSTOMBINARYOPEVAL + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp +#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\ template <typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\ - typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\ +struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\ + typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\ }; -EVALTO(const) -EVALTO() -#undef EVALTO
+// TensorEvalToOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp) +//TensorLayoutSwapOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp) +//TensorIndexTupleOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp) + +#undef EVALTOLAYOUTSWAPINDEXTUPLE /// specialisation of the \ref PlaceHolderExpression when the node is @@ -169,17 +204,24 @@ CHIPPINGOP() #undef CHIPPINGOP /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp -#define SYCLREDUCTION(CVQual)\ +/// TensorReductionOp and TensorTupleReducerOp (Argmax) +#define SYCLREDUCTION(CVQual, ExprNode)\ template <typename OP, typename Dims, typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\ - typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\ +struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\ + typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\ }; -SYCLREDUCTION(const) -SYCLREDUCTION() + +// tensor reduction +SYCLREDUCTION(const, TensorReductionOp) +SYCLREDUCTION(, TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +SYCLREDUCTION(const, TensorTupleReducerOp) +SYCLREDUCTION(, TensorTupleReducerOp) #undef SYCLREDUCTION + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ @@ -218,6 +260,34 @@ SYCLSLICESTRIDEOPPLH() #undef SYCLSLICESTRIDEOPPLH + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOP(CVQual)\ +template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\ + typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ +}; + +SYCLTENSORIMAGEPATCHOP(const) +SYCLTENSORIMAGEPATCHOP() +#undef SYCLTENSORIMAGEPATCHOP + + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOP(CVQual)\ +template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\ +struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\ + typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\ +}; + +SYCLTENSORVOLUMEPATCHOP(const) +SYCLTENSORVOLUMEPATCHOP() +#undef SYCLTENSORVOLUMEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template <typename Expr> struct createPlaceHolderExpression { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index cac785540..29c78184d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,7 +25,6 @@ namespace Eigen { namespace TensorSycl { - template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; @@ -38,7 +37,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx void operator()(cl::sycl::nd_item<1> itemID) { typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; auto device_expr 
=internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice()); typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); if (gId < range) device_evaluator.evalScalar(gId); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 000000000..2b1968de1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com> +// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. + * + * + */ + +namespace internal { +template<typename Dims, typename XprType> +struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename Dims, typename XprType> +struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense> +{ + typedef const TensorTraceOp<Dims, XprType>& type; +}; + +template<typename Dims, typename XprType> +struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type> +{ + typedef TensorTraceOp<Dims, XprType> type; +}; + +} // end namespace internal + + +template<typename Dims, typename XprType> +class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested; + typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + + +// Eval as rvalue +template<typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> +{ + typedef 
TensorTraceOp<Dims, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumReducedDims = internal::array_size<Dims>::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumOutputDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) + { + + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. 
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } + else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } + else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array<Index, NumInputDims> input_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } + else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if(m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } + else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Initialize the result + CoeffReturnType result = internal::cast<int, CoeffReturnType>(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) + cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values); + return result; + } + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + 
const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } + else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + array<bool, NumInputDims> m_reduced; + array<Index, NumReducedDims> m_reducedDims; + // Initialize the size of the trace dimension + Index m_traceDim; + array<Index, NumOutputDims> m_outputStrides; + array<Index, NumReducedDims> m_reducedStrides; + array<Index, NumOutputDims> m_preservedStrides; +}; + + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a1e944e59..006b37921 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -61,6 +61,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > typedef T& RefType; }; + typedef typename MakePointer<Scalar>::Type PointerType; }; @@ -81,6 +82,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > typedef T& RefType; }; + typedef typename MakePointer<Scalar>::Type PointerType; }; @@ -105,6 +107,7 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> > typedef typename MakePointerT::RefType RefType; }; + typedef typename MakePointer<Scalar>::Type PointerType; }; template<typename PlainObjectType> @@ -121,6 +124,7 @@ struct traits<TensorRef<PlainObjectType> > Options = BaseTraits::Options, Flags = BaseTraits::Flags }; + typedef typename BaseTraits::PointerType PointerType; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 0ca2cac84..51c099591 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -22,6 +22,7 @@ namespace Eigen { * dimensions. 
*/ namespace internal { + template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType> { @@ -33,6 +34,8 @@ struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits typedef typename remove_reference<Nested>::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + }; template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> @@ -65,12 +68,12 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), - m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, @@ -80,13 +83,31 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, DenseIndex padding_right, Scalar padding_value) - : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), - m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), 
m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + +#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + bool padding_explicit, DenseIndex padding_top_z, DenseIndex padding_bottom_z, + DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), m_padding_explicit(padding_explicit), m_padding_top_z(padding_top_z), + m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), m_padding_left(padding_left), + m_padding_right(padding_right), m_padding_type(padding_type), m_padding_value(padding_value) {} + +#endif EIGEN_DEVICE_FUNC DenseIndex patch_planes() const { return m_patch_planes; } @@ -183,9 +204,16 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D CoordAccess = false, RawAccess = false }; +#ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -322,6 +350,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D // Fast representations of different variables. 
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride); @@ -338,7 +367,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]); } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -502,10 +530,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // Required by SYCL in order to construct the expression on the device + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif + Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -600,6 +633,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D Scalar m_paddingValue; TensorEvaluator<ArgType, Device> m_impl; + +#ifdef EIGEN_USE_SYCL +// Required by SYCL in order to construct the expression on the device + XprType m_op; +#endif + }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 9dcc9dab7..1264a0270 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -24,9 +24,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) - : num_threads_(num_threads), + : env_(env), + num_threads_(num_threads), allow_spinning_(allow_spinning), - env_(env), threads_(num_threads), queues_(num_threads), coprimes_(num_threads), diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 573ca435a..96b3a8261 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template <typename T, size_t n> class array { diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index d2808860c..279fe5cd3 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -683,4 +683,11 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> > } +namespace std { +template <typename T> +class numeric_limits<Eigen::AutoDiffScalar<T> > + : public numeric_limits<typename T::Scalar> {}; + +} // namespace std + #endif // EIGEN_AUTODIFF_SCALAR_H diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index a5d034d71..e43cdb7fb 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -341,7 +341,7 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d) // set from a vector of Euler angles template<class System, class Other> - struct eulerangles_assign_impl<System,Other,4,1> + struct eulerangles_assign_impl<System,Other,3,1> { typedef typename Other::Scalar Scalar; static void run(EulerAngles<Scalar, System>& e, const Other& vec) diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h index d113e6e90..9b3eb53e0 100644 --- a/unsupported/Eigen/src/IterativeSolvers/Scaling.h +++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h @@ -104,12 +104,18 @@ class IterScaling for (int i = 0; i < m; ++i) { Dr(i) = std::sqrt(Dr(i)); + } + for (int i = 0; i < n; ++i) + { Dc(i) = std::sqrt(Dc(i)); } // Save the scaling factors for (int i = 0; i < m; ++i) { m_left(i) /= Dr(i); + } + for (int i = 0; i < n; ++i) + { m_right(i) /= Dc(i); } // Scale the rows and the columns of the matrix diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index bb6d9e1fe..85ab3d97c 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -326,6 +326,7 @@ struct matrix_exp_computeUV<MatrixType, long double> } else if (l1norm < 1.125358383453143065081397882891878e+000L) { matrix_exp_pade13(arg, U, V); } else { + const long double maxnorm = 2.884233277829519311757165057717815L; frexp(l1norm / maxnorm, &squarings); if (squarings < 0) squarings = 0; MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings)); @@ -342,6 +343,27 @@ struct matrix_exp_computeUV<MatrixType, long double> } }; +template<typename T> struct is_exp_known_type : false_type {}; +template<> struct is_exp_known_type<float> : true_type {}; +template<> struct is_exp_known_type<double> : true_type {}; +#if LDBL_MANT_DIG <= 112 +template<> struct is_exp_known_type<long double> : true_type {}; +#endif + +template <typename ArgType, typename ResultType> +void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type +{ + typedef typename ArgType::PlainObject MatrixType; + MatrixType U, V; + int squarings; + matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) + MatrixType numer = U + V; + MatrixType denom = -U + V; + result = denom.partialPivLu().solve(numer); + for 
(int i=0; i<squarings; i++) + result *= result; // undo scaling by repeated squaring +} + /* Computes the matrix exponential * @@ -349,26 +371,13 @@ struct matrix_exp_computeUV<MatrixType, long double> * \param result variable in which result will be stored */ template <typename ArgType, typename ResultType> -void matrix_exp_compute(const ArgType& arg, ResultType &result) +void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default { typedef typename ArgType::PlainObject MatrixType; -#if LDBL_MANT_DIG > 112 // rarely happens typedef typename traits<MatrixType>::Scalar Scalar; typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename std::complex<RealScalar> ComplexScalar; - if (sizeof(RealScalar) > 14) { - result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>); - return; - } -#endif - MatrixType U, V; - int squarings; - matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - MatrixType numer = U + V; - MatrixType denom = -U + V; - result = denom.partialPivLu().solve(numer); - for (int i=0; i<squarings; i++) - result *= result; // undo scaling by repeated squaring + result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>); } } // end namespace Eigen::internal @@ -402,7 +411,7 @@ template<typename Derived> struct MatrixExponentialReturnValue inline void evalTo(ResultType& result) const { const typename internal::nested_eval<Derived, 10>::type tmp(m_src); - internal::matrix_exp_compute(tmp, result); + internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::Scalar>()); } Index rows() const { return m_src.rows(); } diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index db2449d02..ef50c46a9 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -398,8 +398,8 @@ struct matrix_function_compute template <typename MatrixType> struct matrix_function_compute<MatrixType, 0> { - template <typename AtomicType, typename ResultType> - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template <typename MatA, typename AtomicType, typename ResultType> + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits<MatrixType> Traits; typedef typename Traits::Scalar Scalar; @@ -422,14 +422,14 @@ struct matrix_function_compute<MatrixType, 0> template <typename MatrixType> struct matrix_function_compute<MatrixType, 1> { - template <typename AtomicType, typename ResultType> - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template <typename MatA, typename AtomicType, typename ResultType> + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits<MatrixType> Traits; - typedef typename MatrixType::Index Index; // compute Schur decomposition of A - const ComplexSchur<MatrixType> schurOfA(A); + const ComplexSchur<MatrixType> schurOfA(A); + eigen_assert(schurOfA.info()==Success); MatrixType T = schurOfA.matrixT(); MatrixType U = schurOfA.matrixU(); @@ -514,7 +514,7 @@ template<typename Derived> class MatrixFunctionReturnValue typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType; AtomicType atomic(m_f); - internal::matrix_function_compute<NestedEvalTypeClean>::run(m_A, atomic, result); + internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, 
atomic, result); } Index rows() const { return m_A.rows(); } diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 1acfbed9e..ff8f6e732 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -339,7 +339,7 @@ public: typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType; AtomicType atomic; - internal::matrix_function_compute<DerivedEvalTypeClean>::run(m_A, atomic, result); + internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result); } Index rows() const { return m_A.rows(); } diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 369ad97b4..5d1b8fcc2 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -121,7 +121,7 @@ template <> struct lgamma_impl<float> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgammaf_r(x, &dummy); #else @@ -134,7 +134,7 @@ template <> struct lgamma_impl<double> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgamma_r(x, &dummy); #else diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h index ec4fa8448..e0e3a8be6 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plgamma<float4>(const float4& a) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 003c9de0b..22647cadd 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,25 +152,40 @@ endif() if(EIGEN_TEST_CXX11) if(EIGEN_TEST_SYCL) - ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") + if(EIGEN_SYCL_TRISYCL) + set(CMAKE_CXX_STANDARD 14) + set(STD_CXX_FLAG "-std=c++1z") + else(EIGEN_SYCL_TRISYCL) + # It should be safe to always run these tests as there is some fallback code for + # older compilers that don't support cxx11. + set(CMAKE_CXX_STANDARD 11) + set(STD_CXX_FLAG "-std=c++11") + endif(EIGEN_SYCL_TRISYCL) + + ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) endif(EIGEN_TEST_SYCL) - # It should be safe to always run these tests as there is some fallback code for - # older compiler that don't support cxx11.
- set(CMAKE_CXX_STANDARD 11) ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}") @@ -212,6 +227,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_fft) ei_add_test(cxx11_tensor_ifft) ei_add_test(cxx11_tensor_scan) + ei_add_test(cxx11_tensor_trace) endif() diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index 79ee72847..500fb2d17 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -278,6 +278,9 @@ void test_EulerAngles() EulerAnglesXYZd onesEd(1, 1, 1); EulerAnglesXYZf onesEf = onesEd.cast<float>(); VERIFY_IS_APPROX(onesEd, onesEf.cast<double>()); + + // Simple Construction from Vector3 test + VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones())); CALL_SUBTEST_1( eulerangles_manual<float>() ); CALL_SUBTEST_2( eulerangles_manual<double>() ); diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 4df2f5c57..9cf11280c 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -72,6 +72,20 @@ template<typename Scalar> void check_hyperbolic_functions() VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); } +template <typename Scalar> +void check_limits_specialization() +{ + typedef Eigen::Matrix<Scalar, 1, 1> Deriv; + typedef Eigen::AutoDiffScalar<Deriv> AD; + + typedef std::numeric_limits<AD> A; + typedef std::numeric_limits<Scalar> B; + +#if EIGEN_HAS_CXX11 + VERIFY(bool(std::is_base_of<B, A>::value)); +#endif +} + void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { @@ -79,5 +93,6 @@ void test_autodiff_scalar() CALL_SUBTEST_2( check_atan2<double>() ); CALL_SUBTEST_3( check_hyperbolic_functions<float>() ); CALL_SUBTEST_4( check_hyperbolic_functions<double>() ); + CALL_SUBTEST_5( check_limits_specialization<double>()); } } diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 653443dc5..3d73d491a 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp new file mode 100644 index 000000000..521a7f82c --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -0,0 +1,245 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int Layout, typename DenseIndex> +static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ + + Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2,2,2}}); + Tensor<DenseIndex, 0, Layout, DenseIndex> out_max; + Tensor<DenseIndex, 0, Layout, DenseIndex> out_min; + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(1, 1, 1) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(DataType); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 3>{{2,2,2}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min); + sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + + gpu_out_max.device(sycl_device) = gpu_in.argmax(); + gpu_out_min.device(sycl_device) = gpu_in.argmin(); + + sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); + sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); + + VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_min(), 0); + + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out_max); + sycl_device.deallocate(d_out_min); +} + + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect min to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect min to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + + + + +template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_device(const Device_Selector& d){ + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device); +} + +void test_cxx11_tensor_argmax_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device<double>(device)); + } + +} diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 88c233994..816e03220 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined 
__CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index d4e111f5d..a52350f85 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -11,9 +11,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -107,6 +104,41 @@ static void test_cuda_sum_reductions() { gpu_device.deallocate(gpu_out_ptr); } +static void test_cuda_mean_reductions() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random<int>(1024, 5*1024); + const int num_cols = internal::random<int>(1024, 5*1024); + + Tensor<std::complex<float>, 2> in(num_rows, num_cols); + in.setRandom(); + + Tensor<std::complex<float>, 0> full_redux; + full_redux = in.mean(); + + std::size_t in_bytes = in.size() * sizeof(std::complex<float>); + std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); + std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); + std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.mean(); + + Tensor<std::complex<float>, 0> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} static void test_cuda_product_reductions() { @@ -149,5 +181,6 @@ void test_cxx11_tensor_complex() { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); + CALL_SUBTEST(test_cuda_mean_reductions()); CALL_SUBTEST(test_cuda_product_reductions()); } diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index 2baf5eaad..aac780905 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -11,9 +11,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index dd68430ce..3621e2aa6 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -14,12 +14,10 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 0ba9d52e9..9584a539f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp new file mode 100644 index 000000000..9ff287fff --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -0,0 +1,165 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +template<typename TensorType> +struct InsertZeros { + DSizes<DenseIndex, 2> dimensions(const TensorType& input) const { + DSizes<DenseIndex, 2> result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input, Output& output, const Device& device) const + { + array<DenseIndex, 2> strides; + strides[0] = 2; + strides[1] = 2; + output.stride(strides).device(device) = input; + + Eigen::DSizes<DenseIndex, 2> offsets(1,1); + Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 5; + Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}}; + + Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(out.dimension(0), 6); + VERIFY_IS_EQUAL(out.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), 0); + } + } +} + +template<typename TensorType> +struct BatchMatMul { + DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const { + DSizes<DenseIndex, 3> result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input1, const TensorType& input2, + Output& output, const Device& device) const + { + typedef typename TensorType::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + for (int64_t i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims); + } + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + + Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}}; + Eigen::array<IndexType, 3> 
tensorRange2 = {{3,7,5}}; + Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}}; + + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange1); + TensorType gpu_in2(gpu_in2_data, tensorRange2); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + in2.setRandom(); + + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); + + gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 5; ++i) { + typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims); + TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i); + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } +} + +template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + +} +void test_cxx11_tensor_custom_op_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(custom_op_perDevice<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index fde20ddf2..7c14bc187 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -13,12 +13,10 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; using Eigen::RowMajor; diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index aca036cde..a21514d56 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); 
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp new file mode 100644 index 000000000..f551c8d0c --- /dev/null +++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL +static const float error_threshold =1e-8f; + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +struct Generator1D { + Generator1D() { } + + float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const { + return coordinates[0]; + } +}; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_1D_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 6; + array<IndexType, 1> tensorRange = {{sizeDim1}}; + Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange); + Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange); + + const size_t tensorBuffSize =vec.size()*sizeof(DataType); + DataType* gpu_data_vec = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize); + gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D()); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize); + + for (IndexType i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(result(i), i); + } +} + + +struct Generator2D { + Generator2D() { } + + float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const { + return 3 * coordinates[0] + 11 * coordinates[1]; + } +}; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_2D_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 7; + array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange); + Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange); + + const size_t tensorBuffSize =matrix.size()*sizeof(DataType); + DataType* gpu_data_matrix = 
static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + + TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize); + gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D()); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize); + + for (IndexType i = 0; i < 5; ++i) { + for (IndexType j = 0; j < 5; ++j) { + VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); + } + } +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType rows = 32; + IndexType cols = 48; + array<DataType, 2> means; + means[0] = rows / 2.0f; + means[1] = cols / 2.0f; + array<DataType, 2> std_devs; + std_devs[0] = 3.14f; + std_devs[1] = 2.7f; + internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs); + + array<IndexType, 2> tensorRange = {{rows, cols}}; + Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange); + Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange); + + const size_t tensorBuffSize =matrix.size()*sizeof(DataType); + DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + + TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize); + gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize); + + for (IndexType i = 0; i < rows; ++i) { + for (IndexType j = 0; j < cols; ++j) { + DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f; + DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f; + DataType gaussian = expf(-g_rows - g_cols); + Eigen::internal::isApprox(result(i, j), gaussian, error_threshold); + } + } +} + +template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +void test_cxx11_tensor_generator_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_generator_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp new file mode 100644 index 000000000..eea18ec70 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. 
+// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_image_patch_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template <typename DataType, typename IndexType> +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, 
sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + 
gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); + + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + } + if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); + + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + + } + if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); + +} + + +// Verifies VALID padding (no padding) with incrementing values. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 3; + IndexType input_cols = 3; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
+ + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), 
result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
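+ // With VALID padding: (5 - 3) / 2 + 1 = 2 patch positions per spatial
+ // dimension, i.e. 2 * 2 = 4 patches in total, which is the size of the patch
+ // dimension allocated and verified below.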
+ // ColMajor + + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), 
result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template <typename DataType, typename IndexType> +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
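+ // With SAME padding the number of patch positions per spatial dimension is
+ // ceil(input / stride): ceil(4 / 2) * ceil(2 / 2) = 2 * 1 = 2 patches, which
+ // is the size of the patch dimension allocated and verified below.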
+ + // ColMajor + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + 
VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template <typename DataType, typename IndexType> +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = 
tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template <typename DataType, typename IndexType> +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + 
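+ // extract_image_patches(11, 11) presumably uses the default unit strides and
+ // SAME padding, so there is one 11x11 patch per input position: 128 * 128
+ // patches of depth 3 for each of the 16 batches, as the checks below confirm.
+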
gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + IndexType patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + IndexType patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + IndexType patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + IndexType patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_same_sycl<DataType, int64_t>(sycl_device); +test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device); +test_imagenet_patches_sycl<DataType, int64_t>(sycl_device); +} +void test_cxx11_tensor_image_patch_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp new file mode 100644 index 000000000..f2f87f7ed --- /dev/null +++ 
b/unsupported/test/cxx11_tensor_inflation_sycl.cpp @@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation definition: for each dimension, the inflated size is
+// ((dim - 1) * stride[dim] + 1).
+
+// For a 1-dimensional vector of size 3 with values (4, 4, 4) and an inflation
+// stride of 3, the result is a tensor of size (3 - 1) * 3 + 1 = 7 with the values
+// (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+ tensor.setRandom();
+
+ array<IndexType, 4> strides;
+ strides[0] = 1;
+ strides[1] = 1;
+ strides[2] = 1;
+ strides[3] = 1;
+
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+ VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+
+ IndexType inflatedSizeDim1 = 3;
+ IndexType inflatedSizeDim2 = 9;
+ IndexType inflatedSizeDim3 = 9;
+ IndexType inflatedSizeDim4 = 19;
+ array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+ const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+ DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+
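+ // Expected inflated sizes, using ((dim - 1) * stride + 1) per dimension:
+ // (2-1)*2+1 = 3, (3-1)*4+1 = 9, (5-1)*2+1 = 9, (7-1)*3+1 = 19,
+ // matching the inflatedSizeDim values above.
+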
gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides); + sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize); + + VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1); + VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2); + VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3); + VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4); + + for (IndexType i = 0; i < inflatedSizeDim1; ++i) { + for (IndexType j = 0; j < inflatedSizeDim2; ++j) { + for (IndexType k = 0; k < inflatedSizeDim3; ++k) { + for (IndexType l = 0; l < inflatedSizeDim4; ++l) { + if (i % strides[0] == 0 && + j % strides[1] == 0 && + k % strides[2] == 0 && + l % strides[3] == 0) { + VERIFY_IS_EQUAL(inflated(i,j,k,l), + tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3])); + } else { + VERIFY_IS_EQUAL(0, inflated(i,j,k,l)); + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_stride); + sycl_device.deallocate(gpu_data_inflated); +} + +template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +void test_cxx11_tensor_inflation_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_inflation_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp new file mode 100644 index 000000000..9e8db8b4b --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
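+
+// swap_layout reverses both the storage order and the order of the dimensions,
+// so a ColMajor tensor of size 2x3x7 maps to a RowMajor tensor of size 7x3x2
+// with tensor1(i,j,k) == tensor2(k,j,i), which is what these tests verify.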
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, typename IndexType> +static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); + //tensor.setRandom(); + +// Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template <typename DataType, typename IndexType> +static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.swap_layout().device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); +// tensor.setRandom(); + + //Tensor<float, 3, RowMajor> tensor2(7,3,2); +// tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) 
{ + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_swap_sycl<DataType, int64_t>(sycl_device); + test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device); +} +void test_cxx11_tensor_layout_swap_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 908a5e5a9..167b75d25 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,12 +13,10 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; template<typename> diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp new file mode 100644 index 000000000..88a29cb31 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -0,0 +1,249 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
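+
+// extract_patches returns a tensor of rank NumDims + 1: the patch sizes plus a
+// patch-index dimension, which comes last for ColMajor tensors and first for
+// RowMajor ones, as the dimension checks below assume. A minimal host-side
+// sketch of the same call (sizes chosen arbitrarily for illustration):
+//   Eigen::Tensor<float, 2> t(3, 4);
+//   t.setRandom();
+//   Eigen::array<ptrdiff_t, 2> patch_size{{2, 2}};
+//   Eigen::Tensor<float, 3> patches = t.extract_patches(patch_size);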
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 5> patchTensorRange; + if (DataLayout == ColMajor) { + patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}}; + }else{ + patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}}; + } + + Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange); + + tensor.setRandom(); + + array<ptrdiff_t, 4> patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_no_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + } else { + VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size()); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), 1); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + + if (DataLayout == ColMajor) { + patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; + }else{ + patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}}; + } + Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange); + patchTensorBuffSize =single_patch.size()*sizeof(DataType); + DataType* gpu_data_single_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange); + + gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + 
VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + } else { + VERIFY_IS_EQUAL(single_patch.dimension(0), 1); + VERIFY_IS_EQUAL(single_patch.dimension(1), 2); + VERIFY_IS_EQUAL(single_patch.dimension(2), 3); + VERIFY_IS_EQUAL(single_patch.dimension(3), 5); + VERIFY_IS_EQUAL(single_patch.dimension(4), 7); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,2,1,2*2*4*7}}; + }else{ + patchTensorRange = {{2*2*4*7, 1, 2,2,1}}; + } + Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange); + patchTensorBuffSize =twod_patch.size()*sizeof(DataType); + DataType* gpu_data_twod_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange); + + gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + } else { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 1); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + } else { + patch_loc = l + 7 * (k + 4 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0)); + } + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,3,5,2*2*3*3}}; + }else{ + patchTensorRange = {{2*2*3*3, 1, 2,3,5}}; + } + Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange); + patchTensorBuffSize =threed_patch.size()*sizeof(DataType); + DataType* gpu_data_threed_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange); + + gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + } else { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 5); + } + + 
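[Editor's note] The verification loops that follow index each patch by the linearization of its starting offsets; the fastest-varying offset depends on the layout. Restating the formulas used in the test for the 3-d patch case (annotation only, not part of the patch):

    // ColMajor: the first offset varies fastest
    //   patch_loc = i + 2 * (j + 2 * (k + 3 * l));
    // RowMajor: the last offset varies fastest
    //   patch_loc = l + 3 * (k + 3 * (j + 2 * i));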
for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + } else { + patch_loc = l + 3 * (k + 3 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z)); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_patch); + sycl_device.deallocate(gpu_data_single_patch); + sycl_device.deallocate(gpu_data_twod_patch); + sycl_device.deallocate(gpu_data_threed_patch); +} + +template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +void test_cxx11_tensor_patch_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index b3be199e1..fa1a46732 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 6858b43a7..ec0669704 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu index 5f146f3c9..1d4edef11 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -13,12 +13,10 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp new file mode 100644 index 000000000..340d1211c --- /dev/null +++ b/unsupported/test/cxx11_tensor_trace.cpp @@ -0,0 +1,171 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
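[Editor's note] The file below adds coverage for the trace operation introduced by this change. A minimal host-side usage sketch, consistent with the assertions in the test (a hypothetical standalone example, not part of this patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::Tensor;

    int main() {
      Tensor<float, 3> t(3, 5, 3);
      t.setRandom();
      // Trace over dimensions 0 and 2: result(i) is the sum over j of t(j, i, j).
      Eigen::array<ptrdiff_t, 2> dims{{0, 2}};
      Tensor<float, 1> result = t.trace(dims);
      return result.dimension(0) == 5 ? 0 : 1;
    }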
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +template <int DataLayout> +static void test_0D_trace() { + Tensor<float, 0, DataLayout> tensor; + tensor.setRandom(); + array<ptrdiff_t, 0> dims; + Tensor<float, 0, DataLayout> result = tensor.trace(dims); + VERIFY_IS_EQUAL(result(), tensor()); +} + + +template <int DataLayout> +static void test_all_dimensions_trace() { + Tensor<float, 3, DataLayout> tensor1(5, 5, 5); + tensor1.setRandom(); + Tensor<float, 0, DataLayout> result1 = tensor1.trace(); + VERIFY_IS_EQUAL(result1.rank(), 0); + float sum = 0.0f; + for (int i = 0; i < 5; ++i) { + sum += tensor1(i, i, i); + } + VERIFY_IS_EQUAL(result1(), sum); + + Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7); + array<ptrdiff_t, 5> dims({{2, 1, 0, 3, 4}}); + Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims); + VERIFY_IS_EQUAL(result2.rank(), 0); + sum = 0.0f; + for (int i = 0; i < 7; ++i) { + sum += tensor2(i, i, i, i, i); + } + VERIFY_IS_EQUAL(result2(), sum); +} + + +template <int DataLayout> +static void test_simple_trace() { + Tensor<float, 3, DataLayout> tensor1(3, 5, 3); + tensor1.setRandom(); + array<ptrdiff_t, 2> dims1({{0, 2}}); + Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1); + VERIFY_IS_EQUAL(result1.rank(), 1); + VERIFY_IS_EQUAL(result1.dimension(0), 5); + float sum = 0.0f; + for (int i = 0; i < 5; ++i) { + sum = 0.0f; + for (int j = 0; j < 3; ++j) { + sum += tensor1(j, i, j); + } + VERIFY_IS_EQUAL(result1(i), sum); + } + + Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7); + tensor2.setRandom(); + array<ptrdiff_t, 2> dims2({{2, 3}}); + Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2); + VERIFY_IS_EQUAL(result2.rank(), 2); + VERIFY_IS_EQUAL(result2.dimension(0), 5); + VERIFY_IS_EQUAL(result2.dimension(1), 5); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + sum = 0.0f; + for (int k = 0; k < 7; ++k) { + sum += tensor2(i, j, k, k); + } + VERIFY_IS_EQUAL(result2(i, j), sum); + } + } + + array<ptrdiff_t, 2> dims3({{1, 0}}); + Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3); + VERIFY_IS_EQUAL(result3.rank(), 2); + VERIFY_IS_EQUAL(result3.dimension(0), 7); + VERIFY_IS_EQUAL(result3.dimension(1), 7); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + sum = 0.0f; + for (int k = 0; k < 5; ++k) { + sum += tensor2(k, k, i, j); + } + VERIFY_IS_EQUAL(result3(i, j), sum); + } + } + + Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3); + tensor3.setRandom(); + array<ptrdiff_t, 3> dims4({{0, 2, 4}}); + Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4); + VERIFY_IS_EQUAL(result4.rank(), 2); + VERIFY_IS_EQUAL(result4.dimension(0), 7); + VERIFY_IS_EQUAL(result4.dimension(1), 7); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + sum = 0.0f; + for (int k = 0; k < 3; ++k) { + sum += tensor3(k, i, k, j, k); + } + VERIFY_IS_EQUAL(result4(i, j), sum); + } + } + + Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5); + tensor4.setRandom(); + array<ptrdiff_t, 2> dims5({{1, 3}}); + Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5); + VERIFY_IS_EQUAL(result5.rank(), 3); + VERIFY_IS_EQUAL(result5.dimension(0), 3); + VERIFY_IS_EQUAL(result5.dimension(1), 4); + VERIFY_IS_EQUAL(result5.dimension(2), 5); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 5; ++k) { + sum = 0.0f; + for (int l = 0; l < 7; ++l) { + sum += tensor4(i, l, j, l, k); + } + VERIFY_IS_EQUAL(result5(i, j, k), sum); + } + } + } +} + + +template<int 
DataLayout> +static void test_trace_in_expr() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3); + tensor.setRandom(); + array<ptrdiff_t, 2> dims({{1, 3}}); + Tensor<float, 2, DataLayout> result(2, 5); + result = result.constant(1.0f) - tensor.trace(dims); + VERIFY_IS_EQUAL(result.rank(), 2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + sum = 0.0f; + for (int k = 0; k < 3; ++k) { + sum += tensor(i, k, j, k); + } + VERIFY_IS_EQUAL(result(i, j), 1.0f - sum); + } + } +} + + +void test_cxx11_tensor_trace() { + CALL_SUBTEST(test_0D_trace<ColMajor>()); + CALL_SUBTEST(test_0D_trace<RowMajor>()); + CALL_SUBTEST(test_all_dimensions_trace<ColMajor>()); + CALL_SUBTEST(test_all_dimensions_trace<RowMajor>()); + CALL_SUBTEST(test_simple_trace<ColMajor>()); + CALL_SUBTEST(test_simple_trace<RowMajor>()); + CALL_SUBTEST(test_trace_in_expr<ColMajor>()); + CALL_SUBTEST(test_trace_in_expr<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp new file mode 100644 index 000000000..039715abc --- /dev/null +++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp @@ -0,0 +1,222 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_volume_patch_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template <typename DataType, typename IndexType> +static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + +IndexType sizeDim0 = 4; +IndexType sizeDim1 = 2; +IndexType sizeDim2 = 3; +IndexType sizeDim3 = 5; +IndexType sizeDim4 = 7; +array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; +array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}}; +Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); +Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); +tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + + + // single volume patch: ColMajor + array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}}; + 
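[Editor's note] For orientation (annotation only, not part of the patch): with 1x1x1 patches, extract_volume_patches produces one patch per spatial location, and the rank-6 result declared here is laid out to match the dimension checks below:

    // ColMajor: {channels, z, y, x, number_of_patches, batch}
    //        -> {4, 1, 1, 1, 2*3*5, 7} in this test
    // RowMajor: the same dimensions in reverse order
    //        -> {7, 2*3*5, 1, 1, 1, 4}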
Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange); + gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7); + + array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}}; + Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange); + gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); + + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]); + VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + } + + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_col_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_row_major); +} + +template <typename DataType, typename IndexType> +static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + const int depth = 4; + const int patch_z = 2; + const int patch_y = 3; + const int patch_x = 5; + const int batch = 7; + + array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}}; + array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}}; + Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + 
tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + + // single volume patch: ColMajor + array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}}; + Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange); + gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize); + + +// Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch); +// tensor.setRandom(); +// Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout(); + + //Tensor<float, 6> entire_volume_patch; + //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch); + +// Tensor<float, 6, RowMajor> entire_volume_patch_row_major; + //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + + array<IndexType, 6> patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}}; + Tensor<DataType, 6, RowMajor,IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange); + gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), 
patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth); + + const int dz = patch_z - 1; + const int dy = patch_y - 1; + const int dx = patch_x - 1; + + const int forward_pad_z = dz - dz / 2; + const int forward_pad_y = dy - dy / 2; + const int forward_pad_x = dx - dx / 2; + + for (int pz = 0; pz < patch_z; pz++) { + for (int py = 0; py < patch_y; py++) { + for (int px = 0; px < patch_x; px++) { + const int patchId = pz + patch_z * (py + px * patch_y); + for (int z = 0; z < patch_z; z++) { + for (int y = 0; y < patch_y; y++) { + for (int x = 0; x < patch_x; x++) { + for (int b = 0; b < batch; b++) { + for (int d = 0; d < depth; d++) { + float expected = 0.0f; + float expected_row_major = 0.0f; + const int eff_z = z - forward_pad_z + pz; + const int eff_y = y - forward_pad_y + py; + const int eff_x = x - forward_pad_x + px; + if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 && + eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) { + expected = tensor_col_major(d, eff_z, eff_y, eff_x, b); + expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d); + } + VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected); + VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_col_major); + sycl_device.deallocate(gpu_data_entire_volume_patch_row_major); +} + + + +template<typename DataType, typename dev_Selector> void sycl_tensor_volume_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl; +test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device); +test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device); +} +void test_cxx11_tensor_volume_patch_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device)); +} +} |
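[Editor's note] For completeness, a host-side sketch of the entire-volume-patch case exercised above (a hypothetical standalone example, assuming only the public extract_volume_patches API already used in the test):

    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::Tensor;

    int main() {
      const int depth = 4, patch_z = 2, patch_y = 3, patch_x = 5, batch = 7;
      Tensor<float, 5> t(depth, patch_z, patch_y, patch_x, batch);
      t.setRandom();
      // Patches spanning the whole volume: the rank-6 result is
      // {depth, z, y, x, number_of_patches, batch} for ColMajor,
      // with number_of_patches == patch_z * patch_y * patch_x here.
      Tensor<float, 6> patches = t.extract_volume_patches(patch_z, patch_y, patch_x);
      return patches.dimension(4) == patch_z * patch_y * patch_x ? 0 : 1;
    }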