diff options
author | Gael Guennebaud <g.gael@free.fr> | 2017-02-21 17:04:10 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2017-02-21 17:04:10 +0100 |
commit | b0f55ef85a900a130fe244a0baacd0247db0cb3b (patch) | |
tree | 28b56b9d0cc805d8410ef138c5c5365d5b816f79 | |
parent | d29e9d71192240b7daa5efd477bb72a205b0d9fe (diff) | |
parent | 76687f385c80a4d576d4fadeb271a94d9783b194 (diff) |
merge
87 files changed, 3712 insertions, 945 deletions
diff --git a/Eigen/Core b/Eigen/Core index 1af688637..884546a2b 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -433,6 +433,7 @@ using std::ptrdiff_t; #include "src/Core/util/IndexedViewHelper.h" #include "src/Core/util/ReshapedHelper.h" #include "src/Core/ArithmeticSequence.h" +#include "src/Core/IO.h" #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -482,7 +483,6 @@ using std::ptrdiff_t; #include "src/Core/Redux.h" #include "src/Core/Visitor.h" #include "src/Core/Fuzzy.h" -#include "src/Core/IO.h" #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" #include "src/Core/GeneralProduct.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index 716d52952..131a4edfc 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -59,4 +59,3 @@ #endif // EIGEN_GEOMETRY_MODULE_H /* vim: set filetype=cpp et sw=2 ts=2 ai: */ - diff --git a/Eigen/Sparse b/Eigen/Sparse index a2ef7a665..136e681a1 100644 --- a/Eigen/Sparse +++ b/Eigen/Sparse @@ -25,7 +25,9 @@ #include "SparseCore" #include "OrderingMethods" +#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" +#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/Eigen/StdDeque b/Eigen/StdDeque index be3a7f82b..bc68397be 100644 --- a/Eigen/StdDeque +++ b/Eigen/StdDeque @@ -14,7 +14,7 @@ #include "Core" #include <deque> -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) 
diff --git a/Eigen/StdList b/Eigen/StdList index 07ba1297b..4c6262c08 100644 --- a/Eigen/StdList +++ b/Eigen/StdList @@ -13,7 +13,7 @@ #include "Core" #include <list> -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) diff --git a/Eigen/StdVector b/Eigen/StdVector index fdfc37766..0c4697ad5 100644 --- a/Eigen/StdVector +++ b/Eigen/StdVector @@ -14,7 +14,7 @@ #include "Core" #include <vector> -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...) diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h index 99b954432..ada1571f1 100644 --- a/Eigen/src/Core/ArithmeticSequence.h +++ b/Eigen/src/Core/ArithmeticSequence.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -#if !EIGEN_HAS_CXX11 +#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) template<typename T> struct aseq_negate {}; template<> struct aseq_negate<Index> { @@ -138,7 +138,7 @@ protected: public: -#if EIGEN_HAS_CXX11 +#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); } diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 489935b83..b0ec7b7ca 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling> template<typename Kernel> struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> { - 
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; @@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> template<typename Kernel> struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling> { - EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 9cad1aa1e..cd84b07a5 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -463,7 +463,17 @@ template<typename Derived> class DenseBase EIGEN_DEVICE_FUNC void visit(Visitor& func) const; - inline const WithFormat<Derived> format(const IOFormat& fmt) const; + /** \returns a WithFormat proxy object allowing to print a matrix the with given + * format \a fmt. + * + * See class IOFormat for some examples. 
+ * + * \sa class IOFormat, class WithFormat + */ + inline const WithFormat<Derived> format(const IOFormat& fmt) const + { + return WithFormat<Derived>(derived(), fmt); + } /** \returns the unique coefficient of a 1x1 expression */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 82201d96a..7958feeb9 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -13,9 +13,9 @@ #define EIGEN_MATRIXSTORAGE_H #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN; + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN; #else - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) #endif namespace Eigen { @@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt { internal::plain_array<T,Size,_Options> m_data; public: - EIGEN_DEVICE_FUNC DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage() { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} EIGEN_DEVICE_FUNC - DenseStorage(const DenseStorage& other) : m_data(other.m_data) {} + DenseStorage(const DenseStorage& other) : m_data(other.m_data) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); @@ -343,7 
+347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) @@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam , m_rows(other.m_rows) , m_cols(other.m_cols) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols) internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -403,7 +408,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; m_cols = cols; @@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {} EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); EIGEN_UNUSED_VARIABLE(rows); } @@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols)) , m_cols(other.m_cols) { + 
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -477,7 +483,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_cols = cols; } @@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {} EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); EIGEN_UNUSED_VARIABLE(cols); } @@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols)) , m_rows(other.m_rows) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -550,7 +557,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; } diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 644228c3f..da7fd6cce 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -109,20 +109,6 @@ class 
WithFormat IOFormat m_format; }; -/** \returns a WithFormat proxy object allowing to print a matrix the with given - * format \a fmt. - * - * See class IOFormat for some examples. - * - * \sa class IOFormat, class WithFormat - */ -template<typename Derived> -inline const WithFormat<Derived> -DenseBase<Derived>::format(const IOFormat& fmt) const -{ - return WithFormat<Derived>(derived(), fmt); -} - namespace internal { // NOTE: This helper is kept for backward compatibility with previous code specializing diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 63878428e..8c57a277c 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -19,8 +19,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> > : traits<XprType> { enum { - RowsAtCompileTime = array_size<RowIndices>::value, - ColsAtCompileTime = array_size<ColIndices>::value, + RowsAtCompileTime = int(array_size<RowIndices>::value), + ColsAtCompileTime = int(array_size<ColIndices>::value), MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : int(traits<XprType>::MaxRowsAtCompileTime), MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : int(traits<XprType>::MaxColsAtCompileTime), @@ -29,8 +29,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> > : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : XprTypeIsRowMajor, - RowIncr = get_compile_time_incr<RowIndices>::value, - ColIncr = get_compile_time_incr<ColIndices>::value, + RowIncr = int(get_compile_time_incr<RowIndices>::value), + ColIncr = int(get_compile_time_incr<ColIndices>::value), InnerIncr = IsRowMajor ? ColIncr : RowIncr, OuterIncr = IsRowMajor ? RowIncr : ColIncr, @@ -51,7 +51,7 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> > // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, // but this is too strict regarding negative strides... 
- DirectAccessMask = (InnerIncr!=UndefinedIncr && OuterIncr!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, + DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 639fb92bf..77f4f6066 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -812,6 +812,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type this->_set_noalias(other); } + // Initialize an arbitrary matrix from an object convertible to the Derived type. + template<typename T> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Derived& other){ + this->_set_noalias(other); + } + // Initialize an arbitrary matrix from a generic Eigen expression template<typename T, typename OtherDerived> EIGEN_DEVICE_FUNC @@ -834,7 +841,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type this->derived() = r; } - // For fixed -size arrays: + // For fixed-size Array<Scalar,...> template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, @@ -846,6 +853,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type Base::setConstant(val0); } + // For fixed-size Array<Index,...> template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d392bf3ff..84a56bdcc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -46,7 +46,7 @@ typedef uint32x4_t Packet4ui; const Packet4f p4f_##NAME = pset1<Packet4f>(X) #define 
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1<Packet4i>(X) @@ -83,7 +83,7 @@ template<> struct packet_traits<float> : default_packet_traits HasSqrt = 0 }; }; -template<> struct packet_traits<int> : default_packet_traits +template<> struct packet_traits<int32_t> : default_packet_traits { typedef Packet4i type; typedef Packet4i half; // Packet2i intrinsics not implemented yet @@ -105,11 +105,11 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { @@ -117,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1<Packet4f>(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) +template<> EIGEN_STRONG_INLINE 
Packet4i plset<Packet4i>(const int32_t& a) { const int32_t i[] = {0, 1, 2, 3}; Packet4i countdown = vld1q_s32(i); @@ -240,20 +240,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { float32x2_t lo, hi; lo = vld1_dup_f32(from); hi = vld1_dup_f32(from+1); return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) { int32x2_t lo, hi; lo = vld1_dup_s32(from); @@ -261,11 +261,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) return vcombine_s32(lo, hi); } -template<> 
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) { @@ -276,7 +276,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa res = vsetq_lane_f32(from[3*stride], res, 3); return res; } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) { Packet4i res = pset1<Packet4i>(0); res = vsetq_lane_s32(from[0*stride], res, 0); @@ -293,7 +293,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co to[stride*2] = vgetq_lane_f32(from, 2); to[stride*3] = vgetq_lane_f32(from, 3); } -template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, 
Packet4i>(int32_t* to, const Packet4i& from, Index stride) { to[stride*0] = vgetq_lane_s32(from, 0); to[stride*1] = vgetq_lane_s32(from, 1); @@ -301,12 +301,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const to[stride*3] = vgetq_lane_s32(from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -361,7 +361,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) return sum; } -template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, sum; @@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, prod; @@ -436,7 +436,7 @@ 
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, min; @@ -461,7 +461,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, max; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5cd2794a4..7122efa60 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -148,7 +148,7 @@ struct tribb_kernel ResMapper res(_res, resStride); gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; - Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert())); // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 3477d7182..c2f084c82 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -104,13 +104,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, // - the sizes are large enough // compute the maximal number of threads from the size of the product: - // FIXME this has to be fine tuned + // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once. Index size = transpose ? 
rows : cols; - Index pb_max_threads = std::max<Index>(1,size / 32); + Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr); + // compute the maximal number of threads from the total amount of work: double work = static_cast<double>(rows) * static_cast<double>(cols) * static_cast<double>(depth); - double kMinTaskSize = 50000; // Heuristic. + double kMinTaskSize = 50000; // FIXME improve this heuristic. pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize)); // compute the number of threads we are going to use diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8a2f7cd78..6ec5a8a0b 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +284,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) 
triangularBuffer.diagonal().setZero(); diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index ab0550895..29c796647 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -23,7 +23,7 @@ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC 1 + #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) #else #define EIGEN_COMP_GNUC 0 #endif @@ -80,8 +80,8 @@ // 2015 14 1900 // "15" 15 1900 -/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC -#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) +/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl +#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 @@ -349,6 +349,14 @@ # define __has_feature(x) 0 #endif +// Some old compilers do not support template specializations like: +// template<typename T,int N> void foo(const T x[N]); +#if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 +#else +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 +#endif + // Upperbound on the C++ version to use. // Expected values are 03, 11, 14, 17, etc. // By default, let's use an arbitrarily large C++ version. @@ -829,7 +837,7 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index ee0531b32..90eda6e70 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -97,17 +97,22 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; }; template<> struct is_arithmetic<signed long> { enum { value = true }; }; template<> struct is_arithmetic<unsigned long> { enum { value = true }; }; -template<typename T> struct is_integral { enum { value = false }; }; -template<> struct is_integral<bool> { enum { value = true }; }; -template<> struct is_integral<char> { enum { value = true }; }; -template<> struct is_integral<signed char> { enum { value = true }; }; -template<> struct is_integral<unsigned char> { enum { value = true }; }; -template<> struct is_integral<signed short> { enum { value = true }; }; -template<> struct is_integral<unsigned short> { enum { value = true }; }; -template<> struct is_integral<signed int> { enum { value = true }; }; -template<> struct is_integral<unsigned int> { enum { value = true }; }; -template<> struct is_integral<signed long> { enum { value = true }; }; -template<> struct is_integral<unsigned long> { enum { value = true }; }; +#if EIGEN_HAS_CXX11 +using std::is_integral; +#else +template<typename T> struct is_integral { enum { value = false }; }; +template<> struct is_integral<bool> { enum { value = true }; }; +template<> struct 
is_integral<char> { enum { value = true }; }; +template<> struct is_integral<signed char> { enum { value = true }; }; +template<> struct is_integral<unsigned char> { enum { value = true }; }; +template<> struct is_integral<signed short> { enum { value = true }; }; +template<> struct is_integral<unsigned short> { enum { value = true }; }; +template<> struct is_integral<signed int> { enum { value = true }; }; +template<> struct is_integral<unsigned int> { enum { value = true }; }; +template<> struct is_integral<signed long> { enum { value = true }; }; +template<> struct is_integral<unsigned long> { enum { value = true }; }; +#endif + template <typename T> struct add_const { typedef const T type; }; template <typename T> struct add_const<T&> { typedef T& type; }; diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index ec3b1633e..dc5fae06a 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -250,7 +250,7 @@ template<typename _MatrixType> class ComplexEigenSolver EigenvectorType m_matX; private: - void doComputeEigenvectors(const RealScalar& matrixnorm); + void doComputeEigenvectors(RealScalar matrixnorm); void sortEigenvalues(bool computeEigenvectors); }; @@ -284,10 +284,12 @@ ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool template<typename MatrixType> -void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm) +void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm) { const Index n = m_eivalues.size(); + matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits<RealScalar>::min)()); + // Compute X such that T = X D X^(-1), where D is the diagonal of T. // The matrix X is unit triangular. 
m_matX = EigenvectorType::Zero(n, n); diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index d6a339f07..f5c86041d 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -248,12 +248,24 @@ template<typename MatrixType> template<typename InputType> RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU) { + const Scalar considerAsZero = (std::numeric_limits<Scalar>::min)(); + eigen_assert(matrix.cols() == matrix.rows()); Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrix.rows(); Scalar scale = matrix.derived().cwiseAbs().maxCoeff(); + if(scale<considerAsZero) + { + m_matT.setZero(matrix.rows(),matrix.cols()); + if(computeU) + m_matU.setIdentity(matrix.rows(),matrix.cols()); + m_info = Success; + m_isInitialized = true; + m_matUisUptodate = computeU; + return *this; + } // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index a9f56c4f5..9ddd553f2 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -414,7 +414,8 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType> if(n==1) { - m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]); + m_eivec = matrix; + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); if(computeEigenvectors) m_eivec.setOnes(n,n); m_info = Success; diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 39bf8c83d..01a7ed188 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -87,7 +87,8 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec const TriangularView<const VectorsType, UnitLower> V(vectors); // A -= V T V^* A - Matrix<typename 
MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0, + Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime, + (VectorsType::MaxColsAtCompileTime==1 && MatrixType::MaxColsAtCompileTime!=1)?RowMajor:ColMajor, VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat; // FIXME add .noalias() once the triangular product can work inplace if(forward) tmp = T.template triangularView<Upper>() * tmp; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index e0cfb6283..43488b1e0 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -112,9 +112,11 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime> + typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime> TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd) @@ -200,10 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? 
(MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime> + typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime> TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 1233e164e..8a5cc91f2 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -336,7 +336,7 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator { do { ++m_cachedIndex; - } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon); + } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<=m_epsilon); if (m_cachedIndex<m_vector.m_end) m_cachedValue = m_vector.m_buffer[m_cachedIndex]; else @@ -347,7 +347,7 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer); do { m_currentEl = llElements[m_currentEl].next; - } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon); + } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon); if (m_currentEl<0) { m_cachedIndex = -1; @@ -363,9 +363,9 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator protected: const AmbiVector& m_vector; // the target vector - StorageIndex m_currentEl; // the current element in sparse/linked-list mode + StorageIndex m_currentEl; // the current element in sparse/linked-list mode RealScalar m_epsilon; // epsilon used to prune zero coefficients - StorageIndex m_cachedIndex; // current coordinate + StorageIndex m_cachedIndex; // current coordinate Scalar m_cachedValue; // current value bool m_isDense; // mode of the vector }; diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h 
b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 145a7389e..c41c07af1 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -357,6 +357,16 @@ struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, Itera explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} }; +// "sparse ./ dense" +template<typename T1, typename T2, typename Lhs, typename Rhs> +struct binary_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased> + : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> > +{ + typedef CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> XprType; + typedef sparse_conjunction_evaluator<XprType> Base; + explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} +}; + // "sparse && sparse" template<typename Lhs, typename Rhs> struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased> diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 2d5a4e507..5caf14469 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -78,8 +78,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa class Block, fix, fix<N>(int) /// -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -92,8 +92,8 @@ block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) } /// This is the const version of block(Index,Index,NRowsType,NColsType) -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -124,8 +124,8 @@ 
EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -138,8 +138,8 @@ topRightCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of topRightCorner(NRowsType, NColsType). -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -229,8 +229,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -243,8 +243,8 @@ topLeftCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of topLeftCorner(Index, Index). 
-EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -333,8 +333,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -348,8 +348,8 @@ bottomRightCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of bottomRightCorner(NRowsType, NColsType). -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -439,8 +439,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -454,8 +454,8 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of bottomLeftCorner(NRowsType, NColsType). 
-EIGEN_DEVICE_FUNC template<typename NRowsType, typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type #else @@ -544,8 +544,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -558,8 +558,8 @@ topRows(NRowsType n) } /// This is the const version of topRows(NRowsType). -EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -619,8 +619,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -633,8 +633,8 @@ bottomRows(NRowsType n) } /// This is the const version of bottomRows(NRowsType). -EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -695,8 +695,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -709,8 +709,8 @@ middleRows(Index startRow, NRowsType n) } /// This is the const version of middleRows(Index,NRowsType). 
-EIGEN_DEVICE_FUNC template<typename NRowsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type #else @@ -771,8 +771,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -785,8 +785,8 @@ leftCols(NColsType n) } /// This is the const version of leftCols(NColsType). -EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -846,8 +846,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -860,8 +860,8 @@ rightCols(NColsType n) } /// This is the const version of rightCols(NColsType). -EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -922,8 +922,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -936,8 +936,8 @@ middleCols(Index startCol, NColsType numCols) } /// This is the const version of middleCols(Index,NColsType). 
-EIGEN_DEVICE_FUNC template<typename NColsType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type #else @@ -1130,8 +1130,8 @@ inline ConstRowXpr row(Index i) const /// /// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block /// -EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else @@ -1146,8 +1146,8 @@ segment(Index start, NType n) /// This is the const version of segment(Index,NType). -EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else @@ -1180,8 +1180,8 @@ segment(Index start, NType n) const /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else @@ -1195,8 +1195,8 @@ head(NType n) } /// This is the const version of head(NType). -EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else @@ -1229,8 +1229,8 @@ head(NType n) const /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else @@ -1244,8 +1244,8 @@ tail(NType n) } /// This is the const version of tail(Index). 
-EIGEN_DEVICE_FUNC template<typename NType> +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type #else diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index b2cc2944a..22c1666c5 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -7,7 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_PARSED_BY_DOXYGEN +#if !defined(EIGEN_PARSED_BY_DOXYGEN) // This file is automatically included twice to generate const and non-const versions @@ -112,6 +112,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); } +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + // The folowing three overloads are needed to handle raw Index[N] arrays. 
template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices> @@ -138,6 +140,8 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col (derived(), rowIndices, colIndices); } +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + // Overloads for 1D vectors/arrays template<typename Indices> @@ -181,6 +185,8 @@ operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST return Base::operator()(internal::eval_expr_given_size(id,size())); } +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + template<typename IndicesT, std::size_t IndicesN> typename internal::enable_if<IsRowMajor, IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >::type @@ -201,6 +207,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST (derived(), indices, IvcIndex(0)); } +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + #undef EIGEN_INDEXED_VIEW_METHOD_CONST #undef EIGEN_INDEXED_VIEW_METHOD_TYPE @@ -256,5 +264,4 @@ template<typename Indices> IndexedView_or_VectorBlock operator()(const Indices& indices); -#endif // EIGEN_PARSED_BY_DOXYGEN - +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index 7eca4d966..6df190869 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -5,29 +5,12 @@ #include "tensor_benchmarks.h" -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; -// Simple functions -template <typename device_selector> -cl::sycl::queue sycl_queue() { - return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - }); -} - #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - cl::sycl::queue q = sycl_queue<cl::sycl::gpu_selector>(); \ - 
Eigen::SyclDevice device(q); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \ suite.FUNC(iters); \ } \ diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 07ebed61b..27e5c9b1f 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -138,7 +138,7 @@ else() message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") endif() -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) +set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) # Check if the platform is supported execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 48a28b58a..ed5aed1c8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -151,6 +151,7 @@ ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") ei_add_test(unalignedassert) ei_add_test(vectorization_logic) ei_add_test(basicstuff) +ei_add_test(constructor) ei_add_test(linearstructure) ei_add_test(integer_types) ei_add_test(unalignedcount) diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 99d91f9da..c346ce6cb 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -49,6 +49,22 @@ template<typename MatrixType> void basicStuff(const MatrixType& m) v1[r] = x; VERIFY_IS_APPROX(x, v1[r]); + // test fetching with various index types. 
+ Index r1 = internal::random<Index>(0, numext::mini(Index(127),rows-1)); + x = v1(static_cast<char>(r1)); + x = v1(static_cast<signed char>(r1)); + x = v1(static_cast<unsigned char>(r1)); + x = v1(static_cast<signed short>(r1)); + x = v1(static_cast<unsigned short>(r1)); + x = v1(static_cast<signed int>(r1)); + x = v1(static_cast<unsigned int>(r1)); + x = v1(static_cast<signed long>(r1)); + x = v1(static_cast<unsigned long>(r1)); +#if EIGEN_HAS_CXX11 + x = v1(static_cast<long long int>(r1)); + x = v1(static_cast<unsigned long long int>(r1)); +#endif + VERIFY_IS_APPROX( v1, v1); VERIFY_IS_NOT_APPROX( v1, 2*v1); VERIFY_IS_MUCH_SMALLER_THAN( vzero, v1); diff --git a/test/constructor.cpp b/test/constructor.cpp new file mode 100644 index 000000000..eec9e2192 --- /dev/null +++ b/test/constructor.cpp @@ -0,0 +1,84 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#define TEST_ENABLE_TEMPORARY_TRACKING + +#include "main.h" + +template<typename MatrixType> struct Wrapper +{ + MatrixType m_mat; + inline Wrapper(const MatrixType &x) : m_mat(x) {} + inline operator const MatrixType& () const { return m_mat; } + inline operator MatrixType& () { return m_mat; } +}; + +template<typename MatrixType> void ctor_init1(const MatrixType& m) +{ + // Check logic in PlainObjectBase::_init1 + Index rows = m.rows(); + Index cols = m.cols(); + + MatrixType m0 = MatrixType::Random(rows,cols); + + VERIFY_EVALUATION_COUNT( MatrixType m1(m0), 1); + VERIFY_EVALUATION_COUNT( MatrixType m2(m0+m0), 1); + VERIFY_EVALUATION_COUNT( MatrixType m2(m0.block(0,0,rows,cols)) , 1); + + Wrapper<MatrixType> wrapper(m0); + VERIFY_EVALUATION_COUNT( MatrixType m3(wrapper) , 1); +} + + +void test_constructor() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) ); + CALL_SUBTEST_1( ctor_init1(Matrix4d()) ); + CALL_SUBTEST_1( ctor_init1(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_1( ctor_init1(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + } + { + Matrix<Index,1,1> a(123); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Matrix<Index,1,1> a(123.0); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Matrix<float,1,1> a(123); + VERIFY_IS_EQUAL(a[0], 123.f); + } + { + Array<Index,1,1> a(123); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Array<Index,1,1> a(123.0); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Array<float,1,1> a(123); + VERIFY_IS_EQUAL(a[0], 123.f); + } + { + Array<Index,3,3> a(123); + VERIFY_IS_EQUAL(a(4), 123); + } + { + Array<Index,3,3> a(123.0); + VERIFY_IS_EQUAL(a(4), 123); + } + { + Array<float,3,3> a(123); + VERIFY_IS_EQUAL(a(4), 123.f); + } +} diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index 8e2bb9ef0..293b1b265 100644 --- a/test/eigensolver_complex.cpp +++ 
b/test/eigensolver_complex.cpp @@ -131,6 +131,15 @@ template<typename MatrixType> void eigensolver(const MatrixType& m) ComplexEigenSolver<MatrixType> eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + ComplexEigenSolver<MatrixType> ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index e18fbf687..d0e644d4b 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -76,6 +76,15 @@ template<typename MatrixType> void eigensolver(const MatrixType& m) EigenSolver<MatrixType> eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + EigenSolver<MatrixType> ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 4ed126116..39ad4130e 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -180,6 +180,15 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m) SelfAdjointEigenSolver<MatrixType> eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + SelfAdjointEigenSolver<MatrixType> ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + 
VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template<int> diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 909d2351d..7245cf378 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -297,7 +297,7 @@ void check_indexed_view() VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); -#if (!EIGEN_COMP_CLANG) || (EIGEN_COMP_CLANG>=308 && !defined(__apple_build_version__)) +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array<int,4>{{3, 1, 6, 5}}, all) ); VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array<int,4>{{3, 1, 6, 5}}) ); VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array<int,3>{{1,3,5}},std::array<int,4>{{3, 1, 6, 5}}) ); diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 3d8d0203d..7f5f71562 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -101,6 +101,12 @@ void test_jacobisvd() // Test on inf/nan matrix CALL_SUBTEST_7( (svd_inf_nan<JacobiSVD<MatrixXf>, MatrixXf>()) ); CALL_SUBTEST_10( (svd_inf_nan<JacobiSVD<MatrixXd>, MatrixXd>()) ); + + // bug1395 test compile-time vectors as input + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,6,1>()) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,6>()) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,Dynamic,1>(r)) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,Dynamic>(c)) )); } CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); diff --git a/test/main.h b/test/main.h index 1d5bdc1c4..25d2dcf43 100644 --- a/test/main.h +++ b/test/main.h @@ -41,6 +41,7 @@ #include <complex> #include <deque> #include <queue> +#include <cassert> #include <list> #if __cplusplus >= 201103L #include <random> @@ -79,10 +80,12 @@ #ifdef 
TEST_ENABLE_TEMPORARY_TRACKING static long int nb_temporaries; +static long int nb_temporaries_on_assert = -1; inline void on_temporary_creation(long int size) { // here's a great place to set a breakpoint when debugging failures in this test! if(size!=0) nb_temporaries++; + if(nb_temporaries_on_assert>0) assert(nb_temporaries<nb_temporaries_on_assert); } #define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); } diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp index 5ef0d2b2e..7d04d6bba 100644 --- a/test/mpl2only.cpp +++ b/test/mpl2only.cpp @@ -12,7 +12,9 @@ #include <Eigen/SparseCore> #include <Eigen/SparseLU> #include <Eigen/SparseQR> +#include <Eigen/Sparse> #include <Eigen/IterativeLinearSolvers> +#include <Eigen/Eigen> int main() { diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index 70b469ebc..db1266579 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -37,8 +37,7 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m) RightPermutationType rp(rv); MatrixType m_permuted = MatrixType::Random(rows,cols); - const int one_if_dynamic = MatrixType::SizeAtCompileTime==Dynamic ? 
1 : 0; - VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, one_if_dynamic); // 1 temp for sub expression "lp * m_original" + VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original" for (int i=0; i<rows; i++) for (int j=0; j<cols; j++) @@ -50,7 +49,7 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m) VERIFY_IS_APPROX(m_permuted, lm*m_original*rm); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, one_if_dynamic); + VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1); VERIFY_IS_APPROX(m_permuted, lm*m_original*rm); VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original); @@ -75,19 +74,19 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m) // check inplace permutations m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse()); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, lp*m_original); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, 1); // 1 temp to allocate the 
mask VERIFY_IS_APPROX(m_permuted, m_original*rp); if(rows>1 && cols>1) diff --git a/test/redux.cpp b/test/redux.cpp index 6ddc59c18..989e1057b 100644 --- a/test/redux.cpp +++ b/test/redux.cpp @@ -70,10 +70,10 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m) VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(), Scalar(1)); // test nesting complex expression - VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) ); Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows); m2.setRandom(); - VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 
0 : 1)); } template<typename VectorType> void vectorRedux(const VectorType& w) @@ -156,8 +156,10 @@ void test_redux() CALL_SUBTEST_1( matrixRedux(Array<float, 1, 1>()) ); CALL_SUBTEST_2( matrixRedux(Matrix2f()) ); CALL_SUBTEST_2( matrixRedux(Array2f()) ); + CALL_SUBTEST_2( matrixRedux(Array22f()) ); CALL_SUBTEST_3( matrixRedux(Matrix4d()) ); CALL_SUBTEST_3( matrixRedux(Array4d()) ); + CALL_SUBTEST_3( matrixRedux(Array44d()) ); CALL_SUBTEST_4( matrixRedux(MatrixXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) ); CALL_SUBTEST_4( matrixRedux(ArrayXXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) ); CALL_SUBTEST_5( matrixRedux(MatrixXd (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) ); diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 91b7cb335..384985028 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -161,17 +161,21 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re if(internal::random<bool>()) m1.makeCompressed(); + Index m1_nnz = m1.nonZeros(); + VERIFY_IS_APPROX(m1*s1, refM1*s1); VERIFY_IS_APPROX(m1+m2, refM1+refM2); VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3); VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2)); VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2); + VERIFY_IS_APPROX(m4=m1/s1, refM1/s1); + VERIFY_IS_EQUAL(m4.nonZeros(), m1_nnz); if(SparseMatrixType::IsRowMajor) VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0))); else VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.col(0)), refM1.col(0).dot(refM2.col(0))); - + DenseVector rv = DenseVector::Random(m1.cols()); DenseVector cv = DenseVector::Random(m1.rows()); Index r = internal::random<Index>(0,m1.rows()-2); @@ -208,8 +212,12 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m1.sum(), refM1.sum()); + m4 = m1; refM4 = m4; + VERIFY_IS_APPROX(m1*=s1, refM1*=s1); + 
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); VERIFY_IS_APPROX(m1/=s1, refM1/=s1); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); VERIFY_IS_APPROX(m1+=m2, refM1+=refM2); VERIFY_IS_APPROX(m1-=m2, refM1-=refM2); @@ -220,13 +228,22 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) ); + m1 = m4; refM1 = refM4; } // test aliasing VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1)); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval())); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval())); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1)); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; if(m1.isCompressed()) { diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 739eacaf3..f3ab561ee 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -231,12 +231,12 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m) Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::RowsAtCompileTime> m1m1 = m1 * m1.transpose(); VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum()); Matrix<Scalar,1,MatrixType::RowsAtCompileTime> tmp(rows); - VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), (MatrixType::RowsAtCompileTime==Dynamic ? 
1 : 0)); + VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1); m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval(); m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())); VERIFY_IS_APPROX( m1, m2 ); - VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) ); } void test_vectorwiseop() diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 4cfe300eb..23a74460e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template <typename std::size_t... Indices> +template <typename std::ptrdiff_t... 
Indices> struct is_input_scalar<Sizes<Indices...> > { static const bool value = (Sizes<Indices...>::total_size == 1); }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 1ba7ef170..f335edf7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -150,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -206,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -218,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(m_stride > index); return m_impl.template packet<LoadMode>(index + m_inputOffset); @@ -274,17 +274,29 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } } + /// used by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { + return m_dim.actualDim(); + } + + /// used by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { + return m_offset; + } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -304,6 +316,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> TensorEvaluator<ArgType, Device> m_impl; const internal::DimensionId<DimId> m_dim; const Device& m_device; +// required by sycl + const DenseIndex m_offset; + }; @@ -344,7 +359,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; @@ -355,7 +370,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> inputIndex += this->m_inputStride; } } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(this->m_stride > index); this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 442c14fac..bf4a476d9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -64,9 +64,9 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index template<typename LhsScalar, typename RhsScalar, typename Scalar> struct libxsmm_wrapper { libxsmm_wrapper() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {} - void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {} - void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {} + libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} + void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} + void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} }; template<> @@ -220,7 +220,7 @@ struct TensorContractionEvaluatorBase m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), - m_result(NULL), m_expr_indices(op.indices()) { + m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -682,7 +682,9 @@ protected: } m_can_use_xsmm = true; - #endif +#else + EIGEN_UNUSED_VARIABLE(eval_op_indices); +#endif } #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) @@ -842,9 +844,6 @@ protected: TensorEvaluator<EvalRightArgType, Device> m_rightImpl; const Device& m_device; Scalar* m_result; - /// required for 
sycl - const Indices m_expr_indices; - bool m_can_use_xsmm; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index b170a1a5c..e87de0c57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -22,7 +22,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H namespace Eigen { -template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; +template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; template<typename Indices, typename LeftArgType, typename RightArgType> struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> : public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > { @@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, + this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, + this->m_i_strides, 
this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); } // required by sycl to construct the expr on the device. Returns original left_impl const TensorEvaluator<LeftArgType, Device>& left_impl() const { @@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const TensorEvaluator<RightArgType, Device>& right_impl() const { return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); } - // required by sycl to construct the expr on the device - const Indices& indices() const {return this->m_expr_indices;} }; -/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster. -template<typename Expr> struct TensorEvaluatorContainer; -template<typename Indices, typename LeftArgType, typename RightArgType> -struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{ - typedef Eigen::DefaultDevice Device; - typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType; - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& 
device) - : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device){} -LeftEvaluator m_leftImpl; -RightEvaluator m_rightImpl; -}; - - -template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, +template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, -int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN, -int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{ - - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - - FunctorExpr functors; +typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN, +typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ + typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; + typedef typename 
Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr; + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr; + LHSFunctorExpr lhs_functors; + RHSFunctorExpr rhs_functors; LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; @@ -206,119 +177,130 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; RightNocontractT m_j_strides, m_right_nocontract_strides; - TupleType tuple_of_accessors; + LHSTupleType left_tuple_of_accessors; + RHSTupleType right_tuple_of_accessors; + Device dev; + - KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_) - :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), 
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - tuple_of_accessors(tuple_of_accessors_){} + left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice()); - typedef TensorEvaluatorContainer<DevExpr> DevEvaluator; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr; + auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors); + auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors); + typedef decltype(lhs_dev_expr.expr) LeftArgType; + typedef decltype(rhs_dev_expr.expr) RightArgType; + typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; typedef 
internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - typename DevEvaluator::LeftEvaluator, LeftNocontractT, + LeftEvaluator, LeftNocontractT, ContractT, 1, lhs_inner_dim_contiguous, false, Unaligned, MakeGlobalPointer> LhsMapper; typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - typename DevEvaluator::RightEvaluator, RightNocontractT, + RightEvaluator, RightNocontractT, ContractT, 1, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), + lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), + rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); // Matmul Kernel // Thread identifiers - const int mLocalThreadId = itemID.get_local(0); // Local ID row - const int nLocalThreadId = itemID.get_local(1); // Local ID col - const int mGroupId = itemID.get_group(0); // Work-group ID row - const int nGroupId = itemID.get_group(1); // Work-group ID localCol - const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID + const Index mLocalThreadId = itemID.get_local(0); // Local ID row + const Index nLocalThreadId = itemID.get_local(1); // Local ID 
col + const Index mGroupId = itemID.get_group(0); // Work-group ID row + const Index nGroupId = itemID.get_group(1); // Work-group ID localCol + const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space float privateLhs; float privateRhs[WorkLoadPerThreadN]; float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { privateRes[wLPTM][wLPTN] = 0.0f; } } // Tile Lhs - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int - localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; + for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localLhsRow = localLhsLinearId% TileSizeDimM; + Index localLhsCol = localLhsLinearId/TileSizeDimM; // Load the value (wide vector load) - int GlobalLhsColId = TileSizeDimK*0 + localLhsCol; + Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol; localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? 
lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0); } // Tile Rhs - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; + for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localRhsRow = localRhsLinearId% TileSizeDimN; + Index localRhsCol = localRhsLinearId/TileSizeDimN; // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; + Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0); } // Loop over all tiles - const int numTiles = roundUpK/TileSizeDimK; - int firstHalf=0; + const Index numTiles = roundUpK/TileSizeDimK; + Index firstHalf=0; do { // Synchronise itemID.barrier(cl::sycl::access::fence_space::local_space); // Load the next tile of Lhs and Rhs into local memory - int nextHalf = firstHalf + 1; + Index nextHalf = firstHalf + 1; if (nextHalf < numTiles) { // Tile A - for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localLhsRow = localLhsLinearId% TileSizeDimM; - int localLhsCol = localLhsLinearId/TileSizeDimM; + for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { + Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localLhsRow = localLhsLinearId% TileSizeDimM; + Index localLhsCol = localLhsLinearId/TileSizeDimM; // global K id - int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; + Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; // Store the 
loaded value into local memory localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0); } // Tile B - for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - int localRhsRow = localRhsLinearId% TileSizeDimN; - int localRhsCol = localRhsLinearId/TileSizeDimN; + for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { + Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; + Index localRhsRow = localRhsLinearId% TileSizeDimN; + Index localRhsCol = localRhsLinearId/TileSizeDimN; // Load the value (wide vector load) - int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; + Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; // Store the loaded vector into local memory localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? 
rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0); } } // Loop over the values of a single tile - for (int k=0; k<TileSizeDimK; k++) { + for (Index k=0; k<TileSizeDimK; k++) { // Cache the values of localRhs in registers - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; } // Perform the computation - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; } } @@ -327,13 +309,12 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr firstHalf++; } while (firstHalf<numTiles); - // Store the final results in C - for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; + for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { + Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; if (globalRow< M){ - for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; + for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { + Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; if(globalCol<N) out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; } @@ -343,56 +324,73 @@ int LocalThreadSizeM, int 
LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr } }; -template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { - -static const int TileSizeDimM = 32; // Tile size for dimension M -static const int TileSizeDimN = 32; // Tile size for dimension N -static const int TileSizeDimK = 16; // Tile size for dimension K -static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M -static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N -static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression +template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { + +static const Index TileSizeDimM = 32ul; // Tile size for dimension M +static const Index TileSizeDimN = 32ul; // Tile size for dimension N +static const Index TileSizeDimK = 16ul; // Tile size for dimension K +static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M +static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N +static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) +static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) +static const Index 
LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression +static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression // RoundUp function to make sure that the global threadId is divisable by local threadId -static int RoundUp(int x, int y) { +static Index RoundUp(Index x, Index y) { return ((((x) + (y) - 1) / (y))*(y)); } -template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> +template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - // create a tuple of accessors from Evaluator + typedef typename Self::XprType HostExpr; - // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - // typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName; - auto functors = Eigen::TensorSycl::internal::extractFunctors(self); - typedef decltype(functors) FunctorExpr; + typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; + typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr; + typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> 
RHSFunctorExpr; + // extract lhs functor list + LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + // extract rhs functor list + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); + Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); + self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self); - typedef decltype(tuple_of_accessors) TupleType; + /// work-around for gcc bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; + /// work-around for gcc bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType; + // create lhs tuple of accessors + LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl()); + // create rhs tuple of accessors + RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl()); + // Local memory for elements of Lhs typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc; LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); // Local memory for elements of Rhs typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); + + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; //OutScalar memory - auto out_res= self.device(). 
template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - typedef decltype(out_res) OutAccessor; + OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); + // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, + KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, - WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors)); + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 860a6949a..b29968b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* 
data() const { return NULL; } + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + protected: template <int LoadMode, bool ActuallyVectorize> struct PacketConv { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index abdf742c6..378f5cccb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { + if (static_cast<size_t>(i + 1) < offset) { m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_cudaOutputStrides[i] = diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 000000000..4247c1c4a --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,476 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel1D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize, range_x, range_y; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<2> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory + const size_t plane_kernel_offset = itemID.get_local(1) * 
num_x_input; + const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); + /// fill the shared memory + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset ; + const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); + if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + const size_t index = plane_kernel_offset+ itemID.get_local(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + + +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel2D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, 
kernelSize_y, range_x, range_y , range_z; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory + const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); + const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; + + /// fill the shared memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t 
first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); + if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + for (size_t j = 0; j < kernelSize_y; j++) { + size_t kernel_offset =kernelSize_x * j; + const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); + for (size_t i = 0; i < kernelSize_x; i++) { + result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + + + +template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, 
typename Index, +typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> +struct EigenConvolutionKernel3D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; +internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , + const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), + kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); 
//the required row to be calculated for the for each plane in shered memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory + const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; + for(size_t p=0; p<numP; p++){ + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) { + for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + (num_x_input * (j + (num_y_input * k))); + const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start ); + if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x + const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y + const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z + + 
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + CoeffReturnType result = static_cast<CoeffReturnType>(0); + for (size_t k = 0; k < kernelSize_z; k++) { + for (size_t j = 0; j < kernelSize_y; j++) { + for (size_t i = 0; i < kernelSize_x; i++) { + const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); + const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + + enum { + IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout, + CoordAccess = false, // to be implemented + 
RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if 
(m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value; + internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { + typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + + typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; + // extract input functor list + InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + + + m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { + + typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc; + /// work-around for gcc 4.8 auto bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType; + // 
create input tuple of accessors + InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl); + + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data); + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType; + KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel); + + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; + m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); + const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); + assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); + auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range + auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array<Index, 1> indices{{m_indices[0]}}; + const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), + EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, 
tuple_of_accessors)); + break; + } + + case 2: { + const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numP = dimensions().TotalSize() / (numX*numY); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; + assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}}; + const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + case 3: { + const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
0 : 2; + const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numZ = dimensions()[m_indices[idxZ]]; + const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); + const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; + const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + /* local tile holds (tileSize + kernel_size - 1) elements along each of x, y and z, matching num_x_input/num_y_input/num_z_input in EigenConvolutionKernel3D */ + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_z -1); + assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, + InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, + numZ, numP, out_res, local_acc, input_functors, 
tuple_of_accessors)); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + }); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 16bbbf894..e209799bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -19,12 +19,9 @@ namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) - template <typename Scalar> class MemCopyFunctor { + template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { public: - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} + MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} void operator()(cl::sycl::nd_item<1> itemID) { auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); @@ -62,7 +59,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device /// get_devices returns all the available opencl devices. 
Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) auto s= (*it).template get_info<cl::sycl::info::device::vendor>(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp + if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); } else{ @@ -133,11 +130,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { std::lock_guard<std::mutex> lock(mutex_); auto it = buffer_map.find(static_cast<const uint8_t*>(p)); if (it != buffer_map.end()) { - auto num_bytes =it->second.get_size(); buffer_map.erase(it); - // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version - std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1> - a1.deallocate(static_cast<uint8_t*>(p), num_bytes); } } @@ -158,7 +151,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; } } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl; + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; abort(); } @@ -197,7 +190,12 @@ struct SyclDevice { /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template<typename Index> EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()/2); + tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()); + auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); + } rng = n; if (rng==0) rng=static_cast<Index>(1); GRange=rng; @@ -207,6 +205,74 @@ struct SyclDevice { if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode); } } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template<typename Index> + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); + if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); + } + Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); + tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast<Index>(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast<Index>(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); + } + tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast<Index>(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast<Index>(GRange0 % tileSize0); + if (xMode != 0) GRange0 += 
static_cast<Index>(tileSize0 - xMode);
+    }
+  }
+
+
+
+  /// Computes the 3-D launch configuration (local and global ranges) for a sycl kernel.
+  ///
+  /// \param dim0,dim1,dim2 requested number of work-items along each dimension.
+  /// \param tileSize0,tileSize1,tileSize2 [out] local (work-group) size along each dimension.
+  /// \param rng0,rng1,rng2 [out] the requested sizes, with a zero size replaced by 1.
+  /// \param GRange0,GRange1,GRange2 [out] global size along each dimension: rngN rounded up
+  ///        to the next multiple of tileSizeN.
+  ///
+  /// The work-group budget is split as powers of two: dimension 2 gets about a third of the
+  /// exponent, dimension 1 half of what remains, and dimension 0 whatever is left.
+  template<typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+    Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+    if(sycl_queue().get_device().is_cpu()){ // intel does not allow to use max workgroup size: cap it at 256 on CPU devices
+      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+    }
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+    rng2=dim2;
+    // A zero-sized dimension is treated as size 1. This must update rng2 (not rng1):
+    // otherwise GRange2 and tileSize2 become 0 and tileSize2 is used as a divisor below.
+    if (rng2==0 ) rng2=static_cast<Index>(1);
+    GRange2=rng2;
+    if (tileSize2>GRange2) tileSize2=GRange2;
+    else if(GRange2>tileSize2){
+      // round the global range up to a multiple of the tile size
+      Index xMode = static_cast<Index>(GRange2 % tileSize2);
+      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+    }
+    // split the remaining work-group budget between dimensions 1 and 0
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+    rng1=dim1;
+    if (rng1==0 ) rng1=static_cast<Index>(1);
+    GRange1=rng1;
+    if (tileSize1>GRange1) tileSize1=GRange1;
+    else if(GRange1>tileSize1){
+      Index xMode = static_cast<Index>(GRange1 % tileSize1);
+      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+    }
+    // dimension 0 receives whatever is left of the work-group budget
+    tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+    rng0 = dim0;
+    if (rng0==0 ) rng0=static_cast<Index>(1);
+    GRange0=rng0;
+    if (tileSize0>GRange0) tileSize0=GRange0;
+    else if(GRange0>tileSize0){
+      Index xMode = static_cast<Index>(GRange0 % tileSize0);
+      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+    }
+  }
   /// allocate device memory
   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes)
const { return m_queue_stream->allocate(num_bytes); @@ -220,21 +286,23 @@ struct SyclDevice { EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } /// the memcpy function - template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer((void*)src); + template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src)); auto it2 = m_queue_stream->find_buffer(dst); auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(T); - i/=sizeof(T); + offset/=sizeof(Index); + i/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset)); + auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); }); - asynchronousExec(); + synchronize(); } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device @@ -242,7 +310,7 @@ struct SyclDevice { /// on it. 
Using a discard_write accessor guarantees that we do not bring back the current value of the /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. - template<typename T> EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); ::memcpy(host_acc.get_pointer(), src, n); } @@ -252,20 +320,22 @@ struct SyclDevice { /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. 
- template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const { + template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { auto it = m_queue_stream->find_buffer(src); auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(T); + offset/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset)); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); }); - asynchronousExec(); + synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -274,7 +344,7 @@ struct SyclDevice { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); - asynchronousExec(); + synchronize(); } struct memsetCghFunctor{ @@ -300,6 +370,24 @@ struct SyclDevice 
{ // there is no l3 cache on cuda devices. return firstLevelCacheSize(); } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); + // return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + + // return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); + // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); + // return stream_->deviceProperties().sharedMemPerBlock; + } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } @@ -307,9 +395,12 @@ struct SyclDevice { sycl_queue().wait_and_throw(); //pass } - EIGEN_STRONG_INLINE void asynchronousExec() const { - sycl_queue().throw_asynchronous();//pass - } + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled + sycl_queue().wait_and_throw(); //pass + + } // This function checks if the runtime recorded an error for the // underlying stream device. 
EIGEN_STRONG_INLINE bool ok() const { @@ -318,6 +409,7 @@ struct SyclDevice { }; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 930837021..abe85c860 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,8 +26,8 @@ namespace Eigen { /// Therefore, by adding the default value, we managed to convert the type and it does not break any /// existing code as its default value is T*. namespace internal { -template<typename XprType, template <class> class MakePointer_> -struct traits<TensorForcedEvalOp<XprType, MakePointer_> > +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> > enum { Flags = 0 }; - template <class T> struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_<T> MakePointerT; - typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - - }; }; -template<typename XprType, template <class> class MakePointer_> -struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense> +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> { - typedef const TensorForcedEvalOp<XprType, MakePointer_>& type; + typedef const TensorForcedEvalOp<XprType>& type; }; -template<typename XprType, template <class> class MakePointer_> -struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type> +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> { - typedef TensorForcedEvalOp<XprType, MakePointer_> type; + typedef TensorForcedEvalOp<XprType> type; }; } // end namespace internal -template<typename XprType, template <class> class MakePointer_> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors> +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; @@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi }; -template<typename ArgType, typename Device, template <class> class MakePointer_> -struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> { - typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType; + typedef TensorForcedEvalOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -150,17 +143,17 @@ struct 
TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} private: TensorEvaluator<ArgType, Device> m_impl; const ArgType m_op; const Device& m_device; - typename MakePointer<CoeffReturnType>::Type m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 9a012c176..2e638992a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp; -template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp; +template<typename XprType> class TensorForcedEvalOp; template<typename ExpressionType, typename DeviceType> class TensorDevice; template<typename Derived, typename Device> struct TensorEvaluator; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 485a082e2..ef1c9c42c 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); +#elif defined(__SYCL_DEVICE_ONLY__) + return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift); #else uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); return (static_cast<uint32_t>(v >> 32) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d582ccbe1..6ddd2ca18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, { typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; static const int NumDims = internal::array_size<Strides>::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -733,7 +739,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); }else{ - /* implies m_strides[i]<0 by assert */ + /* implies m_strides[i]<0 by assert */ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, 
m_impl.dimensions()[i] - 1); } @@ -796,13 +802,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, sizeof(Scalar)); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -858,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef __SYCL_DEVICE_ONLY__ return numext::maxi(min, numext::mini(max,value)); +#else + return cl::sycl::clamp(value, min, max); +#endif } array<Index, NumDims> m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c9912d9d4..c3ca129e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,11 +25,11 @@ namespace Eigen { namespace internal { -template<typename CoeffReturnType> struct syclGenericBufferReducer{ +template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> -static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* 
Two accessors are used: one to the buffer that is being reduced, @@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -54,11 +54,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de length = length / local; } while (length > 1); +} +}; - +template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ +template<typename BufferTOut, typename BufferTIn> +static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ + syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), + bufOut, bufI, dev, length, local); } - }; /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. 
@@ -74,8 +79,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - typedef decltype(functors) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; + FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -108,9 +113,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // Dims dims= self.xprDims(); //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is a workaround for gcc 4.8 bug + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef decltype(tuple_of_accessors) TupleType; + TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto tmp_global_accessor = temp_global_buffer. 
template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); typedef decltype(tmp_global_accessor) OutAccessor; cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), @@ -122,7 +128,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -134,10 +140,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { typedef typename Self::CoeffReturnType CoeffReturnType; static const bool HasOptimizedImplementation = false; - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { + static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - typedef decltype(functors) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; + FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); typename Self::Index range, GRange, tileSize; typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; @@ -147,14 +153,15 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> { /// recursively apply reduction on it 
in order to reduce the whole. dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is workaround for gcc 4.8 bug. + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc; + Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); - + Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range)); + (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 14e392e36..e430b0826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from sycl device + ReverseDimensions functor() const { return m_reverse; } + protected: Dimensions m_dimensions; array<Index, NumDims> m_strides; 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index f8121d17b..2854a4a17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -126,7 +126,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> } else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_dimensions = nbDimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 6c35bfdb6..2237140e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); + m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + Strides functor() const { return m_strides; } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -250,9 +255,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> array<Index, NumDims> 
m_outputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; + const Strides m_strides; }; - // Eval as lvalue template<typename Strides, typename ArgType, typename Device> struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> @@ -286,6 +291,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 2e61ee049..9d5a6d4c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -35,7 +35,7 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; + template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; /// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
@@ -80,6 +80,9 @@ template<typename T> struct GetType<false, T>{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +/// this is used for extracting tensor convolution +#include "TensorConvolutionSycl.h" + // kernel execution using fusion #include "TensorSyclRun.h" //sycl functors diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 113dd2557..ee8f3c9c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -97,8 +97,18 @@ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) -KERNELBROKERCONVERT(, false, TensorForcedEvalOp) +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp +#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ +template <typename Expr>\ +struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ + typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\ +}; +KERNELBROKERCONVERTFORCEDEVAL(const) +KERNELBROKERCONVERTFORCEDEVAL() +#undef KERNELBROKERCONVERTFORCEDEVAL + + + KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT @@ -136,6 +146,18 @@ KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp +#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ +template <DenseIndex DimId, typename Expr>\ +struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\ + typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\ +}; 
+KERNELBROKERCONVERTCHIPPINGOP(const) +KERNELBROKERCONVERTCHIPPINGOP() +#undef KERNELBROKERCONVERTCHIPPINGOP + + + } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index df1a732e7..3b83b1d2c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual ASSIGN(const) ASSIGN() #undef ASSIGN + + + + + /// specialisation of the \ref ExprConstructor struct when the node type is + /// const TensorAssignOp + #define CONVERSIONEXPRCONST(CVQual)\ + template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\ + struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\ + typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\ + typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\ + my_nested_type nestedExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ + }; + + CONVERSIONEXPRCONST(const) + CONVERSIONEXPRCONST() + #undef CONVERSIONEXPRCONST + /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorEvalToOp /// 0 here is the output number in the buffer #define EVALTO(CVQual)\ @@ -212,10 +234,10 @@ EVALTO() /// TensorForcedEvalOp #define FORCEDEVAL(CVQual)\ template <typename OrigExpr, typename DevExpr, size_t N, typename... 
Params>\ -struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\ +struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\ - TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ + typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ + TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ @@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorContractionOp +#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... 
Params>\ +struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ +CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ + static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ + typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ + NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\ + typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\ + Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ + :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ +}; + +SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTION + + #define SYCLSLICEOPEXPR(CVQual)\ template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ @@ -322,6 +368,23 @@ SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) #undef SYCLPADDINGOPEXPRCONST +// TensorChippingOp +#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ +template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ +struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... 
>{\ + typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ + typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template <typename FuncDetector>\ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\ +}; + +SYCLTENSORCHIPPINGOPEXPR(const) +SYCLTENSORCHIPPINGOPEXPR() +#undef SYCLTENSORCHIPPINGOPEXPR + /// template deduction for \ref ExprConstructor struct template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 876fcd45e..b512d43f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -35,6 +35,8 @@ namespace Eigen { namespace TensorSycl { namespace internal { +#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} + /// \struct ExtractAccessor: Extract Accessor Class is used to extract the /// accessor from a buffer. 
/// Depending on the type of the leaf node we can get a read accessor or a @@ -44,22 +46,16 @@ struct ExtractAccessor; struct AccessorConstructor{ template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) { - return ExtractAccessor<Arg>::getTuple(cgh, eval); - } + RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval)) template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) { - return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) + template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) { - return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) + template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){ - return utility::tuple::make_tuple(eval.device().template 
get_sycl_accessor<AcM>(cgh,eval.data())); - } + RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))) }; /// specialisation of the \ref ExtractAccessor struct when the node type is @@ -68,9 +64,7 @@ struct AccessorConstructor{ template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ +RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLUNARYCATEGORYEXTACC(const) @@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC() template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ }; SYCLBINARYCATEGORYEXTACC(const) @@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC() template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\ - -> 
decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\ - return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\ }; SYCLTERNARYCATEGORYEXTACC(const) @@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC() template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\ }; SYCLSELECTOPEXTACC(const) @@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC() template <typename LHSExpr, typename RHSExpr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ }; SYCLTENSORASSIGNOPEXTACC(const) @@ -142,9 +128,7 @@ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, template <typename PlainObjectType, int Options_, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh,const 
TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\ - return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\ }; TENSORMAPEXPR(const, cl::sycl::access::mode::read) @@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write) template <typename Expr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\ - return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; SYCLFORCEDEVALEXTACC(const) @@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC() template <typename Expr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\ - -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\ - return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\ - }\ + RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\ }; SYCLEVALTOEXTACC(const) @@ -185,43 +165,66 @@ SYCLEVALTOEXTACC() template <typename OP, typename Dim, typename Expr, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual 
TensorReductionOp<OP, Dim, Expr>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\ - return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ }; SYCLREDUCTIONEXTACC(const) SYCLREDUCTIONEXTACC() #undef SYCLREDUCTIONEXTACC +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp +#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\ +template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\ + struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +}; + +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONEXTACC + + /// specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorSlicingOp. This is a special case where there is no OP +/// const TensorSlicingOp. 
#define SYCLSLICEOPEXTACC(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ + RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLSLICEOPEXTACC(const) SYCLSLICEOPEXTACC() #undef SYCLSLICEOPEXTACC - +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorStridingSlicingOp. #define SYCLSLICESTRIDEOPEXTACC(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLSLICESTRIDEOPEXTACC(const) SYCLSLICESTRIDEOPEXTACC() #undef SYCLSLICESTRIDEOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorChippingOp. 
+#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\ +template<DenseIndex DimId, typename XprType, typename Dev>\ +struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORCHIPPINGOPEXTACC(const) +SYCLTENSORCHIPPINGOPEXTACC() +#undef SYCLTENSORCHIPPINGOPEXTACC + /// template deduction for \ref ExtractAccessor template <typename Evaluator> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 6f9ab57af..ee020184b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{ }; +/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything +///TensorConversionOp +#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ +template <typename ArgType1, typename ArgType2, typename Dev>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\ + FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\ + : subExpr(expr.impl()) {}\ +}; + +SYCLEXTRFUNCCONVERSION(TensorConversionOp, const) +SYCLEXTRFUNCCONVERSION(TensorConversionOp, ) +#undef SYCLEXTRFUNCCONVERSION + #define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\ template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\ struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\ @@ -169,6 +183,24 @@ 
SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ +template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ + typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\ + typedef typename Evaluator::Dimensions Dimensions;\ + const Dimensions m_dimensions;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\ + : m_dimensions(expr.dimensions()) {}\ +}; + + +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) +#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP + /// specialisation of the \ref FunctorExtractor struct when the node type is /// const TensorSlicingOp. This is an specialisation without OP so it has to be separated. 
#define SYCLEXTRFUNCTSLICEOP(CVQual)\ @@ -253,14 +285,27 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ }; -// TensorContractionOp -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const) -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),) // TensorConcatenationOp SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) #undef SYCLEXTRFUNCCONTRACTCONCAT +//TensorChippingOp +#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ +template<DenseIndex DimId, typename XprType, typename Device>\ +struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\ + FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ + const DenseIndex m_dim;\ + const DenseIndex m_offset;\ + EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\ + EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\ + FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\ + : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\ +}; + +SYCLEXTRFUNCCHIPPINGOP(const) +SYCLEXTRFUNCCHIPPINGOP() +#undef SYCLEXTRFUNCCHIPPINGOP /// template deduction function for FunctorExtractor template <typename Evaluator> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 710e22474..2f7779036 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -18,13 +18,14 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ + template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename 
InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ + OP op; OutputAccessor aOut; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -44,7 +45,12 @@ namespace internal { auto min = (length < local) ? length : local; for (size_t offset = min / 2; offset > 0; offset /= 2) { if (localid < offset) { - scratch[localid] += scratch[localid + offset]; + auto accum = op.initialize(); + op.reduce(scratch[localid], &accum); + op.reduce(scratch[localid + offset], &accum); + op.finalize(accum); + scratch[localid]=accum; + //scratch[localid] += scratch[localid + offset]; } itemID.barrier(cl::sycl::access::fence_space::local_space); } @@ -66,7 +72,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen public: typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_) + ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void 
operator()(cl::sycl::nd_item<1> itemID) { @@ -99,6 +105,46 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen Index range; }; +template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index> +class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> { + public: + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; + typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; + ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) + :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; + auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. 
+ const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. + typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=static_cast<Index>(itemID.get_global_linear_id()); + if (globalid< range) { + typename DeviceSelf::CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); + functor.finalize(accum); + output_accessor_ptr[globalid]= accum/num_values_to_reduce; + } + } + private: + write_accessor output_accessor; + FunctorExpr functors; + Tuple_of_Acc tuple_of_accessors; + Dims dims; + Op functor; + Index range; + Index num_values_to_reduce; +}; template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> class FullReductionKernelFunctor{ @@ -128,18 +174,70 @@ public: /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); - if(globalid<rng) - tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, 
static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)); - else - tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0); + tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)) + : static_cast<CoeffReturnType>(op.initialize()); - if(remaining!=0 && globalid==0 ) + if(remaining!=0 && globalid==0 ){ // this will add the rest of input buffer when the input size is not devidable to red_factor. - tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); + auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>:: + reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); + auto accum = op.initialize(); + op.reduce(tmp_global_accessor.get_pointer()[0], &accum); + op.reduce(remaining_reduce, &accum); + op.finalize(accum); + tmp_global_accessor.get_pointer()[0]=accum; + + } } }; +template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType> +class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{ +public: + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + typedef Eigen::internal::SumReducer<CoeffReturnType> Op; + + OutAccessor tmp_global_accessor; + Index rng , remaining, red_factor; + Op op; + Dims dims; + 
FunctorExpr functors; + TupleType tuple_of_accessors; + + FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc) + :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} + + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + auto scale = (rng*red_factor) + remaining; + + tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? 
((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale) + :static_cast<CoeffReturnType>(op.initialize())/scale; + + if(remaining!=0 && globalid==0 ){ + // this will add the rest of input buffer when the input size is not devidable to red_factor. + auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); + auto accum = op.initialize(); + tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale; + op.reduce(tmp_global_accessor.get_pointer()[0], &accum); + op.reduce(remaining_reduce, &accum); + op.finalize(accum); + tmp_global_accessor.get_pointer()[0]=accum/scale; + } + } +}; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 37fe196ea..a1c112f4d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const) REDUCTIONLEAFCOUNT() #undef REDUCTIONLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp +#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\ +template <typename Indices, typename LhsXprType, typename RhsXprType>\ +struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\ + static const size_t Count =1;\ +}; + +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) +#undef CONTRACTIONCONVOLUTIONLEAFCOUNT + + + /// specialisation of the 
\ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType>\ @@ -124,6 +139,17 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT + +/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp +#define CHIPPINGOPLEAFCOUNT(CVQual)\ +template <DenseIndex DimId, typename XprType>\ +struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{}; + +CHIPPINGOPLEAFCOUNT(const) +CHIPPINGOPLEAFCOUNT() +#undef CHIPPINGOPLEAFCOUNT + + #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 4419a1780..74566dcee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -157,6 +157,18 @@ EVALTO() /// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorChippingOp +#define CHIPPINGOP(CVQual)\ +template <DenseIndex DimId, typename Expr, size_t N>\ +struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\ + typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\ +}; + +CHIPPINGOP(const) +CHIPPINGOP() +#undef CHIPPINGOP + +/// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLREDUCTION(CVQual)\ template <typename OP, typename Dims, typename Expr, size_t N>\ @@ -169,6 +181,20 @@ SYCLREDUCTION() /// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorReductionOp +#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ +template <typename Indices, 
typename LhsXprType, typename RhsXprType, size_t N>\ +struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\ + typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\ +}; +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONPLH + + +/// specialisation of the \ref PlaceHolderExpression when the node is /// TensorCwiseSelectOp #define SLICEOPEXPR(CVQual)\ template <typename StartIndices, typename Sizes, typename XprType, size_t N>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 32930be26..cac785540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -49,19 +49,38 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx /// based expression tree; /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. +/// std::array does not have TotalSize. So I have to get the size through template specialisation. +template<typename , typename Dimensions> struct DimensionSize{ + static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){ + return dim.TotalSize(); + } +}; +#define DIMSIZEMACRO(CVQual)\ +template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\ + static inline Index getDimSize(const std::array<Index, NumDims>& dim){\ + return (NumDims == 0) ? 
1 : ::Eigen::internal::array_prod(dim);\ + }\ +}; + +DIMSIZEMACRO(const) +DIMSIZEMACRO() +#undef DIMSIZEMACRO + + template <typename Expr, typename Dev> void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr; FunctorExpr functors = internal::extractFunctors(evaluator); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator - typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType; - TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator); + typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType; + TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator); typename Expr::Index range, GRange, tileSize; - dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange); + typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions())); + dev.parallel_for_setup(total_size, tileSize, range, GRange); cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 4bb1852b6..bb6d9e1fe 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -61,10 +61,11 @@ 
struct MatrixExponentialScalingOp * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template <typename MatrixType> -void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V) { - typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar; const RealScalar b[] = {120.L, 60.L, 12.L, 1.L}; const MatrixType A2 = A * A; const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); @@ -77,9 +78,10 @@ void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template <typename MatrixType> -void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L}; const MatrixType A2 = A * A; @@ -94,9 +96,10 @@ void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
*/ -template <typename MatrixType> -void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L}; const MatrixType A2 = A * A; @@ -114,9 +117,10 @@ void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template <typename MatrixType> -void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L, 2162160.L, 110880.L, 3960.L, 90.L, 1.L}; @@ -135,9 +139,10 @@ void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
*/ -template <typename MatrixType> -void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L, 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L, @@ -162,9 +167,10 @@ void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) * This function activates only if your long double is double-double or quadruple. */ #if LDBL_MANT_DIG > 64 -template <typename MatrixType> -void matrix_exp_pade17(const MatrixType &A, MatrixType &U, MatrixType &V) +template <typename MatA, typename MatU, typename MatV> +void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L, 100610229646136770560000.L, 15720348382208870400000.L, @@ -204,7 +210,8 @@ struct matrix_exp_computeUV template <typename MatrixType> struct matrix_exp_computeUV<MatrixType, float> { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template <typename ArgType> + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { using std::frexp; using std::pow; @@ -227,7 +234,8 @@ struct matrix_exp_computeUV<MatrixType, float> template <typename MatrixType> struct matrix_exp_computeUV<MatrixType, double> { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template <typename ArgType> + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { using std::frexp; using std::pow; @@ -254,7 +262,8 @@ struct 
matrix_exp_computeUV<MatrixType, double> template <typename MatrixType> struct matrix_exp_computeUV<MatrixType, long double> { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template <typename ArgType> + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { #if LDBL_MANT_DIG == 53 // double precision matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings); @@ -339,9 +348,10 @@ struct matrix_exp_computeUV<MatrixType, long double> * \param arg argument of matrix exponential (should be plain object) * \param result variable in which result will be stored */ -template <typename MatrixType, typename ResultType> -void matrix_exp_compute(const MatrixType& arg, ResultType &result) +template <typename ArgType, typename ResultType> +void matrix_exp_compute(const ArgType& arg, ResultType &result) { + typedef typename ArgType::PlainObject MatrixType; #if LDBL_MANT_DIG > 112 // rarely happens typedef typename traits<MatrixType>::Scalar Scalar; typedef typename NumTraits<Scalar>::Real RealScalar; @@ -354,7 +364,7 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result) MatrixType U, V; int squarings; matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - MatrixType numer = U + V; + MatrixType numer = U + V; MatrixType denom = -U + V; result = denom.partialPivLu().solve(numer); for (int i=0; i<squarings; i++) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index a087f4759..003c9de0b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -163,6 +163,10 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") + 
ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index c426549f1..21fdfca22 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -131,11 +131,6 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl:: std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - - test_broadcast_sycl_fixed<DataType, RowMajor, int>(sycl_device); - test_broadcast_sycl<DataType, RowMajor, int>(sycl_device); - test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device); - test_broadcast_sycl<DataType, ColMajor, int>(sycl_device); test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index d5193d1ea..400a31d09 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -32,20 +32,20 @@ template <typename T> T cube(T x) { return x * x * x; } template <typename T> T inverse(T x) { return 1 / x; } } -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \ +#define 
TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ { \ /* out OPERATOR in.FUNC() */ \ - Tensor<SCALAR, 3> in(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in = in.random() + static_cast<SCALAR>(0.01); \ out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data = static_cast<SCALAR *>( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ @@ -53,7 +53,7 @@ template <typename T> T inverse(T x) { return 1 / x; } gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(in(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -63,18 +63,18 @@ template <typename T> T inverse(T x) { return 1 / x; } } \ { \ /* out OPERATOR out.FUNC() */ \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - 
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ (out.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(reference(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -82,61 +82,62 @@ template <typename T> T inverse(T x) { return 1 / x; } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR) +#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ + 
TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \ +#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ { \ /* out = in.FUNC() */ \ - Tensor<SCALAR, 3> in(tensorRange); \ - Tensor<bool, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ + Tensor<bool, 3, Layout, int64_t> out(tensorRange); \ in = in.random() + static_cast<SCALAR>(0.01); \ SCALAR *gpu_data = static_cast<SCALAR *>( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ bool *gpu_data_out = \ static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<bool, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ + TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(bool)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ } \ sycl_device.deallocate(gpu_data); 
\ sycl_device.deallocate(gpu_data_out); \ } -#define TEST_UNARY_BUILTINS(SCALAR) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf) +#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float) + TEST_UNARY_BUILTINS(float, RowMajor) + TEST_UNARY_BUILTINS(float, ColMajor) } namespace std { @@ -144,24 +145,24 @@ template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); } template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \ +#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ { \ /* out = in_1.FUNC(in_2) */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> in_2(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR 
*gpu_data_2 = static_cast<SCALAR *>( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -169,7 +170,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver = std::FUNC(in_1(i), in_2(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -179,24 +180,24 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR in_2 */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> in_2(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> 
reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -204,7 +205,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ } \ sycl_device.deallocate(gpu_data_1); \ @@ -212,46 +213,48 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR 2 */ \ - Tensor<SCALAR, 3> in_1(tensorRange); \ - Tensor<SCALAR, 3> out(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ + Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3> reference(out); \ + Tensor<SCALAR, 3, Layout, int64_t> 
reference(out); \ SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast<SCALAR *>( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ + TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \ } \ sycl_device.deallocate(gpu_data_1); \ sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS(SCALAR) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /) +#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %) + int64_t sizeDim1 = 10; + 
int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + TEST_BINARY_BUILTINS(float, RowMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) + TEST_BINARY_BUILTINS(float, ColMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) } void test_cxx11_tensor_builtins_sycl() { diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 1832dec8b..89cf5c7b7 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -43,7 +43,7 @@ static void test_simple_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -75,7 +75,7 @@ static void test_simple_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -126,7 +126,7 @@ static void test_dynamic_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -158,7 +158,7 @@ static void test_dynamic_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp new file mode 100644 index 000000000..39e4f0a7f --- /dev/null +++ 
b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ -0,0 +1,622 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); 
+ + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = 
static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize 
=chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = 
static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = 
{{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + 
array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) { + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange); + tensor.setRandom(); + tensor1.setRandom(); + + const size_t tensorBuffSize 
=tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange); + + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1; + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim3; ++j) { + for (int k = 0; k < sizeDim4; ++k) { + for (int l = 0; l < sizeDim5; ++l) { + float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l); + VERIFY_IS_EQUAL(chip1(i,j,k,l), expected); + } + } + } + } + + array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange); + Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange); + tensor2.setRandom(); + const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType); + DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, 
chip2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize); + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2; + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim4; ++j) { + for (int k = 0; k < sizeDim5; ++k) { + float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k); + VERIFY_IS_EQUAL(chip2(i,j,k), expected); + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_tensor1); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_tensor2); + sycl_device.deallocate(gpu_data_chip2); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange); + input1.setRandom(); + input2.setRandom(); + + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, 
DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize); + gpu_tensor.device(sycl_device)=gpu_input1; + sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize); + gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange); + input3.setRandom(); + + const size_t input3TensorBuffSize =input3.size()*sizeof(DataType); + DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize); + gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + 
gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange); + input4.setRandom(); + + const size_t input4TensorBuffSize =input4.size()*sizeof(DataType); + DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize); + gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange); + input5.setRandom(); + + const size_t input5TensorBuffSize =input5.size()*sizeof(DataType); + DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize); + gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m 
= 0; m < sizeDim5; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange); + input6.setRandom(); + + const size_t input6TensorBuffSize =input6.size()*sizeof(DataType); + DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize); + gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } + + + gpu_tensor.device(sycl_device)=gpu_input1; + Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange); + input7.setRandom(); + + DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize); + gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l); + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) 
{ + for (int m = 0; m < sizeDim5; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_input1); + sycl_device.deallocate(gpu_data_input2); + sycl_device.deallocate(gpu_data_input3); + sycl_device.deallocate(gpu_data_input4); + sycl_device.deallocate(gpu_data_input5); + sycl_device.deallocate(gpu_data_input6); + sycl_device.deallocate(gpu_data_input7); + +} + +template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device); + test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +void test_cxx11_tensor_chipping_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_chipping_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp index 5a324b44c..e3023a368 100644 --- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL 
#include "main.h" @@ -22,39 +22,39 @@ using Eigen::Tensor; -template<typename DataType, int DataLayout, typename Index> +template<typename DataType, int DataLayout, typename IndexType> static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Index leftDim3 = 1; - Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Index rightDim3 = 1; - Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; - - //Index concatDim1 = 3; -// Index concatDim2 = 3; -// Index concatDim3 = 1; - //Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; - - Tensor<DataType, 3, DataLayout, Index> left(leftRange); - Tensor<DataType, 3, DataLayout, Index> right(rightRange); + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + IndexType leftDim3 = 1; + Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + IndexType rightDim3 = 1; + Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //IndexType concatDim1 = 3; +// IndexType concatDim2 = 3; +// IndexType concatDim3 = 1; + //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 3, DataLayout, IndexType> right(rightRange); left.setRandom(); right.setRandom(); DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> 
gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); /// - Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); //concatenation = left.concatenate(right, 0); gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); @@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); - for (int j = 0; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); } - for (int i = 2; i < 4; ++i) { + for (IndexType i = 2; i < 4; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); } } sycl_device.deallocate(gpu_out_data1); - Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); - 
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); @@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); } - for (int j = 3; j < 6; ++j) { + for (IndexType j = 3; j < 6; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); } } sycl_device.deallocate(gpu_out_data2); - Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); @@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); 
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); } @@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_in1_data); sycl_device.deallocate(gpu_in2_data); } -template<typename DataType, int DataLayout, typename Index> +template<typename DataType, int DataLayout, typename IndexType> static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}}; + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}}; - Index concatDim1 = 4; - Index concatDim2 = 3; - Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}}; + IndexType concatDim1 = 4; + IndexType concatDim2 = 3; + Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}}; - Tensor<DataType, 2, DataLayout, Index> left(leftRange); - Tensor<DataType, 2, DataLayout, Index> right(rightRange); - Tensor<DataType, 2, DataLayout, Index> result(resRange); + Tensor<DataType, 2, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 2, DataLayout, IndexType> right(rightRange); + Tensor<DataType, 2, DataLayout, IndexType> result(resRange); left.setRandom(); right.setRandom(); @@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType * gpu_out_data = 
static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); @@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(left(i, j), result(i, j)); VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); } @@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_concatenation<DataType, RowMajor, int>(sycl_device); - test_simple_concatenation<DataType, ColMajor, int>(sycl_device); - test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device); + 
test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device); + test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device); + test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_concatenation_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index 0221da110..5bace66c5 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include <iostream> @@ -28,104 +28,172 @@ using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -typedef Tensor<float, 1>::DimensionPair DimPair; -template<int DataLayout, typename Device> -void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size) +template<int DataLayout, typename DataType, typename IndexType, typename Device> +void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 2, DataLayout> t_result(m_size, n_size); - Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); + Tensor<DataType, 2, DataLayout, 
IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); // Eigen::array<DimPair, 1> dims(DimPair(1, 0)); Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; - Eigen::array<int, 2> left_dims = {{m_size, k_size}}; - Eigen::array<int, 2> right_dims = {{k_size, n_size}}; - Eigen::array<int, 2> result_dims = {{m_size, n_size}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, 
left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template<int DataLayout, typename DataType, typename IndexType, typename Device> +void test_TF(const Device& sycl_device) +{ + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Eigen::array<IndexType, 2> left_dims = {{2, 3}}; + Eigen::array<IndexType, 2> right_dims = {{3, 1}}; + Eigen::array<IndexType, 2> res_dims = {{2, 1}}; + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.data()[0] = 1.0f; + t_left.data()[1] = 2.0f; + t_left.data()[2] = 3.0f; + t_left.data()[3] = 4.0f; + t_left.data()[4] = 5.0f; + t_left.data()[5] = 6.0f; + + 
t_right.data()[0] = -1.0f; + t_right.data()[1] = 0.5f; + t_right.data()[2] = 2.0f; + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size()*sizeof(DataType); + + + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) << " vs " << t_result_gpu(i) << std::endl; assert(false); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); -} -template<int DataLayout, 
typename Device> -void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) +} + +template<int DataLayout, typename DataType, typename IndexType, typename Device> +void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 0, DataLayout> t_result; - Tensor<float, 0, DataLayout> t_result_gpu; + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 0, DataLayout, IndexType> t_result; + Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; - Eigen::array<int, 2> left_dims = {{m_size, k_size}}; - Eigen::array<int, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = sizeof(DataType); - float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes)); - float * d_t_result = 
static_cast<float*>(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); - if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f && - !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { std::cout << "mismatch detected: " << t_result() << " vs " << t_result_gpu() << std::endl; assert(false); @@ -137,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_m(const Device& 
sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_k(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_n(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k); } } -template<int DataLayout, typename Device> +template<int DataLayout, typename DataType, typename IndexType, typename Device> void test_sycl_contraction_sizes(const Device& sycl_device) { - int m_sizes[] = { 31, 39, 63, 64, 65, + IndexType m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; - int n_sizes[] = { 31, 39, 63, 64, 65, + IndexType n_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - int k_sizes[] = { 31, 39, 63, 64, 65, + IndexType k_sizes[] = { 31, 39, 63, 64, 65, 95, 96, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + 
test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); } } } @@ -186,24 +254,27 @@ void test_sycl_contraction_sizes(const Device& sycl_device) { template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32); - test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32); - test_scalar<ColMajor>(sycl_device, 32, 32, 32); - test_scalar<RowMajor>(sycl_device, 32, 32, 32); + test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32); + test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32); + test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32); + test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32); std::chrono::time_point<std::chrono::system_clock> start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128); - test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128); - test_scalar<ColMajor>(sycl_device, 128, 128, 128); - test_scalar<RowMajor>(sycl_device, 128, 128, 128); - test_sycl_contraction_m<ColMajor>(sycl_device); - test_sycl_contraction_m<RowMajor>(sycl_device); - test_sycl_contraction_n<ColMajor>(sycl_device); - test_sycl_contraction_n<RowMajor>(sycl_device); - test_sycl_contraction_k<ColMajor>(sycl_device); - test_sycl_contraction_k<RowMajor>(sycl_device); - test_sycl_contraction_sizes<ColMajor>(sycl_device); - test_sycl_contraction_sizes<RowMajor>(sycl_device); + test_sycl_contraction<ColMajor,float,int64_t>(sycl_device, 128, 128, 128); + test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 128, 128, 128); + test_scalar<ColMajor,float,int64_t>(sycl_device, 128, 128, 128); + test_scalar<RowMajor,float,int64_t>(sycl_device, 128, 128, 128); + test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device); + test_sycl_contraction_m<RowMajor, 
float, int64_t>(sycl_device); + test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device); + test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device); + test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device); + test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device); + test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device); + test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device); + test_TF<RowMajor, float, int64_t>(sycl_device); + test_TF<ColMajor, float, int64_t>(sycl_device); + end = std::chrono::system_clock::now(); std::chrono::duration<double> elapsed_seconds = end-start; std::time_t end_time = std::chrono::system_clock::to_time_t(end); @@ -211,6 +282,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s << "elapsed time: " << elapsed_seconds.count() << "s\n"; } + void test_cxx11_tensor_contract_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorContractionPerDevice(device)); diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp new file mode 100644 index 000000000..a4226a63a --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> +#include <iomanip> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +static const float error_threshold =1e-4f; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=55; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 1> kernel_dims = {{4}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, 
kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 2> kernel_dims = {{4,5}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 2> dims3{{0,1}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * 
sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=49; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, 
outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 3> dims3{{0,1,2}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + 
sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_evals(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 1> kernel_dims = {{2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 3}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout,IndexType> result(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + 
VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 2> kernel_dims = {{2, 2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 2}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout, IndexType> result(result_dims); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 2> dims; + dims[0] = 0; + dims[1] = 1; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(result.data(), 
d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_modes(const Eigen::SyclDevice& sycl_device){ + +Eigen::array<IndexType, 1> input_dims = {{3}}; +Eigen::array<IndexType, 1> kernel_dims = {{3}}; + +Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); +Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); + +input.setRandom(); +kernel.setRandom(); +Eigen::array<IndexType, 1> dims; +dims[0] = 0; + + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + Eigen::array<std::pair<IndexType, IndexType>, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
+ padding[0] = std::make_pair(0, 0); + Tensor<DataType, 1, DataLayout, IndexType> valid(1); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t valid_bytes = valid.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); + + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
+ padding[0] = std::make_pair(1, 1); + Tensor<DataType, 1, DataLayout, IndexType> same(3); + std::size_t same_bytes = same.size() * sizeof(DataType); + DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions()); + gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); + + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(2, 2); + + Tensor<DataType, 1, DataLayout, IndexType> full(5); + std::size_t full_bytes = full.size() * sizeof(DataType); + DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions()); + gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); + + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_valid); + sycl_device.deallocate(d_same); + sycl_device.deallocate(d_full); + +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strides(const Eigen::SyclDevice& sycl_device){ + + Eigen::array<IndexType, 1> input_dims = {{13}}; + Eigen::array<IndexType, 1> kernel_dims = {{3}}; + + Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout, IndexType> 
kernel(kernel_dims); + Tensor<DataType, 1, DataLayout, IndexType> result(2); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 1> dims; + dims[0] = 0; + + Eigen::array<IndexType, 1> stride_of_3; + stride_of_3[0] = 3; + Eigen::array<IndexType, 1> stride_of_2; + stride_of_2[0] = 2; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + +template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_larg_expr1D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr1D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr2D<float, RowMajor, int64_t>(sycl_device); + 
test_larg_expr2D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr3D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr3D<float, ColMajor, int64_t>(sycl_device); + test_evals<float, ColMajor, int64_t>(sycl_device); + test_evals<float, RowMajor, int64_t>(sycl_device); + test_expr<float, ColMajor, int64_t>(sycl_device); + test_expr<float, RowMajor, int64_t>(sycl_device); + test_modes<float, ColMajor, int64_t>(sycl_device); + test_modes<float, RowMajor, int64_t>(sycl_device); + test_strides<float, ColMajor, int64_t>(sycl_device); + test_strides<float, RowMajor, int64_t>(sycl_device); +} + +void test_cxx11_tensor_convolution_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConvolutionPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 190dba862..3ecc68df0 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -22,35 +22,35 @@ #include <stdint.h> #include <iostream> -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_device_memory(const Eigen::SyclDevice &sycl_device) { std::cout << "Running on : " << sycl_device.sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::name>() <<std::endl; - int sizeDim1 = 100; - array<int, 1> tensorRange = {{sizeDim1}}; - Tensor<DataType, 1, DataLayout> in(tensorRange); - Tensor<DataType, 1, DataLayout> in1(tensorRange); + IndexType sizeDim1 = 100; + array<IndexType, 1> tensorRange = {{sizeDim1}}; + Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange); + Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange); memset(in1.data(), 1, in1.size() * sizeof(DataType)); DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType)); sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType)); - for (int i=0; i<in.size(); i++) { + for (IndexType i=0; i<in.size(); i++) { VERIFY_IS_EQUAL(in(i), in1(i)); } sycl_device.deallocate(gpu_in_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_device_exceptions(const Eigen::SyclDevice &sycl_device) { VERIFY(sycl_device.ok()); - int sizeDim1 = 100; - array<int, 1> tensorDims = {{sizeDim1}}; + IndexType sizeDim1 = 100; + array<IndexType, 1> tensorDims = {{sizeDim1}}; DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType))); sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType)); - TensorMap<Tensor<DataType, 1, DataLayout>> in(gpu_data, tensorDims); - TensorMap<Tensor<DataType, 1, DataLayout>> out(gpu_data, tensorDims); + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims); + TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims); out.device(sycl_device) = in / in.constant(0); sycl_device.synchronize(); @@ -62,8 +62,8 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; QueueInterface queueInterface(d); auto 
sycl_device = Eigen::SyclDevice(&queueInterface); - test_device_memory<DataType, RowMajor>(sycl_device); - test_device_memory<DataType, ColMajor>(sycl_device); + test_device_memory<DataType, RowMajor, int64_t>(sycl_device); + test_device_memory<DataType, ColMajor, int64_t>(sycl_device); /// this test throw an exception. enable it if you want to see the exception //test_device_exceptions<DataType, RowMajor>(sycl_device); /// this test throw an exception. enable it if you want to see the exception diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 4d19a3b2a..aca036cde 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -14,23 +14,23 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 20; - int sizeDim3 = 20; - Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor<DataType, 3, DataLayout> in1(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout> in2(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 20; + IndexType sizeDim3 = 20; + Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); DataType * gpu_in1_data = 
static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); @@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { in2 = in2.random() + in2.constant(10.0f); // creating TensorMap from tensor - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - 
test_forced_eval_sycl<DataType, RowMajor>(sycl_device); - test_forced_eval_sycl<DataType, ColMajor>(sycl_device); + test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_forced_eval_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 91353b81a..9b521bc6b 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,18 +28,18 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) { - typename Tensor<DataType, 5 ,DataLayout>::Dimensions dim1(2,3,1,7,1); - typename Tensor<DataType, 3 ,DataLayout>::Dimensions dim2(2,3,7); - typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim3(6,7); - typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim4(2,21); + typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1); + typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21); - Tensor<DataType, 5, DataLayout> tensor1(dim1); - Tensor<DataType, 3, DataLayout> tensor2(dim2); - Tensor<DataType, 2, DataLayout> tensor3(dim3); - Tensor<DataType, 2, DataLayout> tensor4(dim4); + Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1); + Tensor<DataType, 3, 
DataLayout, IndexType> tensor2(dim2); + Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3); + Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4); tensor1.setRandom(); @@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, dim1); - TensorMap<Tensor<DataType, 3,DataLayout>> gpu2(gpu_data2, dim2); - TensorMap<Tensor<DataType, 2,DataLayout>> gpu3(gpu_data3, dim3); - TensorMap<Tensor<DataType, 2,DataLayout>> gpu4(gpu_data4, dim4); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1); + TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4); sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); @@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor @@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) } -template<typename DataType, int DataLayout> +template<typename DataType, int DataLayout, typename IndexType> static void 
test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) { - typename Tensor<DataType, 3, DataLayout>::Dimensions dim1(2,3,7); - typename Tensor<DataType, 2, DataLayout>::Dimensions dim2(6,7); - typename Tensor<DataType, 5, DataLayout>::Dimensions dim3(2,3,1,7,1); - Tensor<DataType, 3, DataLayout> tensor(dim1); - Tensor<DataType, 2, DataLayout> tensor2d(dim2); - Tensor<DataType, 5, DataLayout> tensor5d(dim3); + typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7); + typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7); + typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1); + Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1); + Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2); + Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3); tensor.setRandom(); @@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); - TensorMap< Tensor<DataType, 3, DataLayout> > gpu1(gpu_data1, dim1); - TensorMap< Tensor<DataType, 2, DataLayout> > gpu2(gpu_data2, dim2); - TensorMap< Tensor<DataType, 5, DataLayout> > gpu3(gpu_data3, dim3); + TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1); + TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2); + TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); @@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 
0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor @@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_slice(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 2; - int sizeDim2 = 3; - int sizeDim3 = 5; - int sizeDim4 = 7; - int sizeDim5 = 11; - array<int, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 5,DataLayout> tensor(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange); tensor.setRandom(); - array<int, 5> slice1_range ={{1, 1, 1, 1, 1}}; - Tensor<DataType, 5,DataLayout> slice1(slice1_range); + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu2(gpu_data2, slice1_range); - Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); - Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + 
Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.slice(indices, sizes); sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - array<int, 5> slice2_range ={{1,1,2,2,3}}; - Tensor<DataType, 5,DataLayout> slice2(slice2_range); + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout>> gpu3(gpu_data3, slice2_range); - Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); - Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 3; ++k) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); } } @@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); - for(int i=0;i<sizeDim1;i++) for(int j=0;j<sizeDim2;j++){ + for(IndexType i=0;i<sizeDim1;i++) + for(IndexType j=0;j<sizeDim2;j++){ VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); } sycl_device.deallocate(gpu_data1); @@ -230,12 +231,12 @@ static void 
test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_slice<DataType, RowMajor>(sycl_device); - test_simple_slice<DataType, ColMajor>(sycl_device); - test_simple_reshape<DataType, RowMajor>(sycl_device); - test_simple_reshape<DataType, ColMajor>(sycl_device); - test_reshape_as_lvalue<DataType, RowMajor>(sycl_device); - test_reshape_as_lvalue<DataType, ColMajor>(sycl_device); + test_simple_slice<DataType, RowMajor, int64_t>(sycl_device); + test_simple_slice<DataType, ColMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); } diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp index 9e86e4b52..dc748b73e 100644 --- a/unsupported/test/cxx11_tensor_padding_sycl.cpp +++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.pad(paddings); sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); - for (int i = 0; i < padedSizeDim1; ++i) { - for 
(int j = 0; j < padedSizeDim2; ++j) { - for (int k = 0; k < padedSizeDim3; ++k) { - for (int l = 0; l < padedSizeDim4; ++l) { + for (IndexType i = 0; i < padedSizeDim1; ++i) { + for (IndexType j = 0; j < padedSizeDim2; ++j) { + for (IndexType k = 0; k < padedSizeDim3; ++k) { + for (IndexType l = 0; l < padedSizeDim4; ++l) { if (j >= 2 && j < 5 && k >= 3 && k < 8) { VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); } else { @@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 6; ++j) { - for (int k = 0; k < 12; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 6; ++j) { + for (IndexType k = 0; k < 12; ++k) { + for (IndexType l = 0; l < 7; ++l) { const float result_value = DataLayout == ColMajor ? 
result(i+2*j,k+12*l) : result(j+6*i,l+7*k); if (j >= 2 && j < 5 && k >= 3 && k < 8) { @@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_padding<DataType, RowMajor, int>(sycl_device); - test_simple_padding<DataType, ColMajor, int>(sycl_device); - test_padded_expr<DataType, RowMajor, int>(sycl_device); - test_padded_expr<DataType, ColMajor, int>(sycl_device); test_simple_padding<DataType, RowMajor, int64_t>(sycl_device); test_simple_padding<DataType, ColMajor, int64_t>(sycl_device); test_padded_expr<DataType, RowMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 941469029..440d48bca 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -14,97 +14,129 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -template <typename DataType, int DataLayout> -static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { - const int num_rows = 452; - const int num_cols = 765; - array<int, 2> tensorRange = {{num_rows, num_cols}}; + const IndexType num_rows = 452; + const IndexType num_cols = 765; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; - Tensor<DataType, 2, DataLayout> in(tensorRange); - Tensor<DataType, 0, DataLayout> full_redux; - Tensor<DataType, 0, 
DataLayout> full_redux_gpu; + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; in.setRandom(); - full_redux = in.sum(); + full_redux = in.mean(); DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap<Tensor<DataType, 2, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout> > out_gpu(gpu_out_data); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.sum(); + out_gpu.device(sycl_device) = in_gpu.mean(); sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { + + const IndexType num_rows = 876; + const IndexType num_cols = 953; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + + full_redux = in.minimum(); + + DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template <typename DataType, int DataLayout> -static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - int dim_x = 145; - int dim_y = 1; - int dim_z = 67; - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { + + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 0; - array<int, 2> reduced_tensorRange = {{dim_y, dim_z}}; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; - Tensor<DataType, 3, DataLayout> in(tensorRange); - Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); - redux= in.sum(red_axis); + redux= in.maximum(red_axis); DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, 
in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) + for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) + for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template <typename DataType, int DataLayout> -static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { - int dim_x = 567; - int dim_y = 1; - int dim_z = 47; + IndexType dim_x = 567; + IndexType dim_y = 1; + IndexType dim_z = 47; - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 2; - array<int, 2> reduced_tensorRange = {{dim_x, dim_y}}; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; - Tensor<DataType, 3, DataLayout> in(tensorRange); - Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); @@ -113,15 +145,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) DataType* gpu_in_data = 
static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) + for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) + for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); @@ -133,12 +165,14 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl:: QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_sycl<DataType, RowMajor>(sycl_device); - test_first_dim_reductions_sycl<DataType, RowMajor>(sycl_device); - test_last_dim_reductions_sycl<DataType, RowMajor>(sycl_device); - test_full_reductions_sycl<DataType, ColMajor>(sycl_device); - test_first_dim_reductions_sycl<DataType, ColMajor>(sycl_device); - test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device); + test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_first_dim_reductions_max_sycl<DataType, RowMajor, 
int64_t>(sycl_device); + test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_reduction_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp new file mode 100644 index 000000000..2f5484484 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -0,0 +1,221 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { + + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) +{ + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + 
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange); + + + sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + + if (LValue) { + out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; + } else { + out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); + } + sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); + + + array<IndexType, 4> src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array<IndexType, 4> src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array<IndexType, 4> dst_slice_dim = src_slice_dim; + array<IndexType, 4> dst_slice_start = src_slice_start; + + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, 
result.dimensions().TotalSize()*sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = + in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + + +template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){ + std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true); + test_expr_reverse<DataType, 
ColMajor, int64_t>(sycl_device, true); +} +void test_cxx11_tensor_reverse_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_reverse_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp index c4521aac8..c88db7c72 100644 --- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,20 +28,20 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout, typename IndexTypes> +template <typename DataType, int DataLayout, typename IndexType> static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { - IndexTypes sizeDim1 = 2; - IndexTypes sizeDim2 = 3; - IndexTypes sizeDim3 = 5; - IndexTypes sizeDim4 = 7; - array<IndexTypes, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexTypes> tensor(tensorRange); - Tensor<DataType, 4, DataLayout,IndexTypes> no_shuffle(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange); tensor.setRandom(); const size_t buffSize =tensor.size()*sizeof(DataType); - array<IndexTypes, 4> shuffles; + array<IndexType, 4> shuffles; shuffles[0] = 0; shuffles[1] = 1; shuffles[2] = 2; @@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = 
static_cast<DataType*>(sycl_device.allocate(buffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); @@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); } } @@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - array<IndexTypes, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor<DataType, 4, DataLayout,IndexTypes> shuffle(tensorrangeShuffle); + array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle); DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - TensorMap<Tensor<DataType, 4,DataLayout,IndexTypes>> gpu3(gpu_data3, tensorrangeShuffle); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle); gpu3.device(sycl_device)=gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); @@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& 
sycl_device) VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); } } @@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_shuffling_sycl<DataType, RowMajor, int>(sycl_device); - test_simple_shuffling_sycl<DataType, ColMajor, int>(sycl_device); - test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device); test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device); diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp new file mode 100644 index 000000000..603c3746f --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -0,0 +1,203 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_striding(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, 
no_stride_bytes); + + //no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; +//Tensor<float, 4, DataLayout> stride; +// stride = tensor.stride(strides); + + gpu_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (IndexType i = 0; i < 1; ++i) { + for (IndexType j = 0; j < 1; ++j) { + for (IndexType k = 0; k < 3; ++k) { + for (IndexType l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } + + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + + DataType * d_tensor = 
static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + //Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + +// Tensor<float, 4, DataLayout> result(3, 12, 10, 21); +// result.stride(strides) = tensor; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); + } + } + } + } + + array<IndexType, 4> no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; +// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); +// result2.stride(strides) = tensor.stride(no_strides); + + gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); + } + } + } + } + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + 
sycl_device.deallocate(d_stride); +} + + +template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_simple_striding<float, ColMajor, int64_t>(sycl_device); + test_simple_striding<float, RowMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device); +} + +void test_cxx11_tensor_striding_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorStridingPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index d5c0cbaad..5cd0f4c71 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -27,24 +27,24 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3, DataLayout> in1(tensorRange); - Tensor<DataType, 3, DataLayout> out1(tensorRange); - Tensor<DataType, 3, DataLayout> out2(tensorRange); - Tensor<DataType, 3, DataLayout> out3(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> 
out1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); in1 = in1.random(); DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); @@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); @@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data2); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { - int size = 20; - array<int, 1> tensorRange = {{size}}; - Tensor<DataType, 1, DataLayout> in1(tensorRange); - Tensor<DataType, 1, DataLayout> in2(tensorRange); - Tensor<DataType, 1, DataLayout> out(tensorRange); + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 
1, DataLayout, IndexType> out(tensorRange); in1 = in1.random(); in2 = in1; DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 1, DataLayout>> gpu1(gpu_data, tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.synchronize(); in1.setZero(); @@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out(i), in2(i)); } sycl_device.deallocate(gpu_data); } -template <typename DataType, int DataLayout> +template <typename DataType, int DataLayout, typename IndexType> void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3,DataLayout> in1(tensorRange); - Tensor<DataType, 3,DataLayout> in2(tensorRange); - Tensor<DataType, 3,DataLayout> in3(tensorRange); - Tensor<DataType, 3,DataLayout> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); in2 = in2.random(); in3 = in3.random(); @@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); DataType * gpu_out_data = 
static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in3(gpu_in3_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int 
j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); 
@@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k)); @@ -229,15 +229,44 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } +template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> +static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); + + in = in.random(); + + Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); + Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); + + TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); + TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); + gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); + out_host = in. 
template cast<Scalar2>(); + for(IndexType i=0; i< size; i++) + { + VERIFY_IS_APPROX(out(i), out_host(i)); + } + printf("cast Test Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_sycl_mem_transfers<DataType, RowMajor>(sycl_device); - test_sycl_computations<DataType, RowMajor>(sycl_device); - test_sycl_mem_sync<DataType, RowMajor>(sycl_device); - test_sycl_mem_transfers<DataType, ColMajor>(sycl_device); - test_sycl_computations<DataType, ColMajor>(sycl_device); - test_sycl_mem_sync<DataType, ColMajor>(sycl_device); + test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); } void test_cxx11_tensor_sycl() { |