From c0f2cb016e60b7dbde1d5946f42234a709a711f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 28 Apr 2014 10:32:27 -0700 Subject: Extended support for Tensors: * Added ability to map a region of the memory to a tensor * Added basic support for unary and binary coefficient wise expressions, such as addition or square root * Provided an emulation layer to make it possible to compile the code with compilers (such as nvcc) that don't support cxx11. --- Eigen/src/Core/util/Macros.h | 5 + unsupported/Eigen/CXX11/Core | 14 +- unsupported/Eigen/CXX11/Tensor | 27 ++- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 24 ++- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 16 +- .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 184 +++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 156 ++++++++--------- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 52 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 82 +++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 127 ++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 161 ++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 27 +++ unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 101 +++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 52 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 122 ++++++++++++++ unsupported/test/CMakeLists.txt | 5 +- unsupported/test/cxx11_tensor_simple.cpp | 2 +- 17 files changed, 1028 insertions(+), 129 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bfd6ba7de..3a928001e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -121,6 +121,11 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index 4dc4ab224..bba3d578d 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014 Benoit Steiner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -21,20 +22,23 @@ * module. Note that at this stage, you should not need to include * this module directly. * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * * \code * #include * \endcode */ -#include - +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. 
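Note: the dispatch that follows keys off the __cplusplus version macro, mirroring the EIGEN_HAS_VARIADIC_TEMPLATES check added to Macros.h above. A minimal standalone sketch of the same pattern (illustrative only; the TENSOR_DEMO_* name is made up and is not part of the patch):

    // Pre-C++11 compilers (such as nvcc at the time) report __cplusplus <= 199711L
    // and take the emulation path; conforming C++11 compilers report a later value.
    #if __cplusplus <= 199711L
    #define TENSOR_DEMO_HAS_CXX11 0
    #else
    #define TENSOR_DEMO_HAS_CXX11 1
    #endif

    #include <cstdio>

    int main() {
      std::printf("building with C++11 support: %d\n", TENSOR_DEMO_HAS_CXX11);
      return 0;
    }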
+#if __cplusplus <= 199711L +#include "src/Core/util/EmulateCXX11Meta.h" +#else #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" +#endif #include #endif // EIGEN_CXX11_CORE_MODULE -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f2c5129b3..f554c204a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -10,9 +10,10 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" -#include +#include "Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_Tensor_Module Tensor Module * @@ -27,13 +28,21 @@ #include #include -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" +#include "Eigen/Core" -#include +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" -#endif // EIGEN_CXX11_TENSOR_MODULE +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 618e2eb7b..47f06b1b5 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -317,7 +317,7 @@ constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts template constexpr inline Array h_array_reverse(Array arr, numeric_list) { - return {{std_array_get(arr)...}}; + return {{array_get(arr)...}}; } template @@ -335,9 +335,9 @@ constexpr inline std::array array_reverse(std::array arr) // an infinite loop) template struct h_array_reduce { - constexpr static inline auto run(std::array arr) -> decltype(Reducer::run(h_array_reduce::run(arr), std_array_get(arr))) + constexpr static inline auto run(std::array arr) -> decltype(Reducer::run(h_array_reduce::run(arr), array_get(arr))) { - return Reducer::run(h_array_reduce::run(arr), std_array_get(arr)); + return Reducer::run(h_array_reduce::run(arr), array_get(arr)); } }; @@ -346,7 +346,7 @@ struct h_array_reduce { constexpr static inline T run(std::array arr) { - return std_array_get<0>(arr); + return array_get<0>(arr); } }; @@ -375,7 +375,7 @@ constexpr inline auto array_prod(std::array arr) -> decltype(array_reduce< template constexpr inline std::array h_array_zip(std::array a, std::array b, numeric_list) { - return std::array{{ Op::run(std_array_get(a), std_array_get(b))... }}; + return std::array{{ Op::run(array_get(a), array_get(b))... 
}}; } template @@ -387,9 +387,9 @@ constexpr inline std::array array_zip(std::array< /* zip an array and reduce the result */ template -constexpr inline auto h_array_zip_and_reduce(std::array a, std::array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(std_array_get(a), std_array_get(b))...)) +constexpr inline auto h_array_zip_and_reduce(std::array a, std::array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) { - return reduce::type...>::run(Op::run(std_array_get(a), std_array_get(b))...); + return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); } template @@ -403,7 +403,7 @@ constexpr inline auto array_zip_and_reduce(std::array a, std::array template constexpr inline std::array h_array_apply(std::array a, numeric_list) { - return std::array{{ Op::run(std_array_get(a))... }}; + return std::array{{ Op::run(array_get(a))... }}; } template @@ -415,9 +415,9 @@ constexpr inline std::array array_apply(std::array -constexpr inline auto h_array_apply_and_reduce(std::array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(std_array_get(arr))...)) +constexpr inline auto h_array_apply_and_reduce(std::array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) { - return reduce::type...>::run(Op::run(std_array_get(arr))...); + return reduce::type...>::run(Op::run(array_get(arr))...); } template @@ -497,7 +497,3 @@ InstType instantiate_by_c_array(ArrType* arr) } // end namespace Eigen #endif // EIGEN_CXX11META_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 356ae10cf..77207f453 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -40,8 +40,18 @@ #error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) #endif +using std::array; + namespace Eigen { +// Use std::array as Eigen array +/*template +struct array : public std::array { + array() = default; + array(const std::initializer_list& a);// : std::array(a) {}; + array(const std::array& a); +};*/ + namespace internal { /* std::get is only constexpr in C++14, not yet in C++11 @@ -60,9 +70,9 @@ namespace internal { #define STD_GET_ARR_HACK std::template get(a) #endif -template constexpr inline T& std_array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } -template constexpr inline T&& std_array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } -template constexpr inline T const& std_array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } #undef STD_GET_ARR_HACK diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 000000000..76fcba5b4 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,184 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
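Note on the std_array_get -> array_get renames above: array_get becomes the single accessor shared by the C++11 path (where it wraps std::get on a std::array) and the emulation path introduced in the new file below. A hedged sketch of the contract it must satisfy, with std::get standing in for the internal helper:

    #include <array>
    #include <cassert>

    int main() {
      std::array<int, 3> a = {{4, 5, 6}};
      // Eigen::internal::array_get<I>(a) is expected to behave like std::get<I>(a),
      // regardless of whether 'a' is std::array or the emulated Eigen array.
      assert(std::get<1>(a) == 5);
      return 0;
    }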
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template class array { + public: + T& operator[] (size_t index) { return values[index]; } + const T& operator[] (size_t index) const { return values[index]; } + + T values[n]; +}; + + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. + */ + +struct empty_list { static const std::size_t count = 0; }; + +template struct type_list { + T head; + Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template +struct make_type_list { + typedef typename make_type_list::type tailresult; + + typedef type_list type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + + +template +struct type2val { + static const T value = n; +}; + + +template struct gen_numeric_list_repeated; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; +}; + + + +template +array repeat(t v) { + array array; + array.fill(v); + return array; +} + +template +t array_prod(const array& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} +template +t array_prod(const array& /*a*/) { + return 0; +} + +template inline T& array_get(array& a) { + return a[I]; +} +template inline const T& array_get(const array& a) { + return a[I]; +} + +struct sum_op { + template static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + template static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template static inline bool run(A a) { return !a; } +}; +struct negation_op { + template static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template static inline bool run(A a) { return a >= 0; } +}; + + +template +inline bool 
array_apply_and_reduce(const array& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; +} + +template +inline bool array_zip_and_reduce(const array& a, const array& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; +} + +} // end namespace internal + +} // end namespace Eigen + + + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index c6216e14c..7b8f14c6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -57,28 +57,16 @@ namespace Eigen { * * \ref TopicStorageOrders */ -template -class Tensor; namespace internal { -template -struct traits> -{ - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef DenseIndex Index; - enum { - Options = Options_ - }; -}; template struct tensor_index_linearization_helper { - constexpr static inline Index run(std::array const& indices, std::array const& dimensions) + static inline Index run(array const& indices, array const& dimensions) { - return std_array_get(indices) + - std_array_get(dimensions) * + return array_get(indices) + + array_get(dimensions) * tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -86,39 +74,40 @@ struct tensor_index_linearization_helper template struct tensor_index_linearization_helper { - constexpr static inline Index run(std::array const& indices, std::array const&) + static inline Index run(array const& indices, array const&) { - return std_array_get(indices); + return array_get(indices); } }; /* Forward-declaration required for the symmetry support. 
*/ template class tensor_symmetry_value_setter; + } // end namespace internal template -class Tensor +class Tensor : public TensorBase > { - static_assert(NumIndices_ >= 1, "A tensor must have at least one index."); - public: typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; - typedef typename internal::traits::Scalar Scalar; + typedef Scalar_ Scalar; typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; - typedef Self DenseType; + typedef typename Base::CoeffReturnType CoeffReturnType; - constexpr static int Options = Options_; - constexpr static std::size_t NumIndices = NumIndices_; + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; protected: TensorStorage m_storage; public: EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE std::array dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } @@ -129,29 +118,17 @@ class Tensor inline Self& base() { return *this; } inline const Self& base() const { return *this; } - void setZero() - { - // FIXME: until we have implemented packet access and the - // expression engine w.r.t. nullary ops, use this - // as a kludge. Only works with POD types, but for - // any standard usage, this shouldn't be a problem - memset((void *)data(), 0, size() * sizeof(Scalar)); - } - - inline Self& operator=(Self const& other) - { - m_storage = other.m_storage; - return *this; - } - +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeff(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& coeff(const std::array& indices) const + inline const Scalar& coeff(const array& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; @@ -163,14 +140,17 @@ class Tensor return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeffRef(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
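Note: the variadic accessors verify at compile time that the number of indices matches the tensor rank, and the static_assert calls are being rewritten as EIGEN_STATIC_ASSERT so the same check also compiles without C++11. A hedged usage sketch, assuming the module builds through the unsupported/Eigen/CXX11/Tensor header:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);  // rank 3, so exactly 3 indices
      t(1, 2, 3) = 7.0f;                   // ok
      // t(1, 2) = 7.0f;                   // would trip YOU_MADE_A_PROGRAMMING_MISTAKE
      return 0;
    }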
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& coeffRef(const std::array& indices) + inline Scalar& coeffRef(const array& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; @@ -182,14 +162,17 @@ class Tensor return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return this->operator()(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& operator()(const std::array& indices) const + inline const Scalar& operator()(const array& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); @@ -203,18 +186,22 @@ class Tensor inline const Scalar& operator[](Index index) const { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); return coeff(index); } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& operator()(const std::array& indices) + inline Scalar& operator()(const array& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); @@ -228,47 +215,70 @@ class Tensor inline Scalar& operator[](Index index) { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) return coeffRef(index); } - inline Tensor() + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() : m_storage() { } - inline Tensor(const Self& other) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) : m_storage(other.m_storage) { } - inline Tensor(Self&& other) - : m_storage(other.m_storage) - { - } +#ifdef EIGEN_HAVE_RVALUE_REFERENCES +// inline Tensor(Self&& other) +// : m_storage(other.m_storage) +// { +// } +#endif +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... 
otherDimensions) : m_storage() { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor."); - resize(std::array{{firstDimension, otherDimensions...}}); + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); } +#endif - inline Tensor(std::array dimensions) + inline Tensor(const array& dimensions) : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) + { + // FIXME: we need to resize the tensor to fix the dimensions of the other. + // Unfortunately this isn't possible yet when the rhs is an expression. + // resize(other.dimensions()); + internal::TensorAssign::run(*this, other); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template void resize(Index firstDimension, IndexTypes... otherDimensions) { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor."); - resize(std::array{{firstDimension, otherDimensions...}}); + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); } +#endif - void resize(const std::array& dimensions) + void resize(const array& dimensions) { std::size_t i; Index size = Index(1); @@ -285,20 +295,22 @@ class Tensor #endif } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, Index firstIndex, IndexTypes... otherIndices) { - return symCoeff(symmetry, std::array{{firstIndex, otherIndices...}}); + return symCoeff(symmetry, array{{firstIndex, otherIndices...}}); } template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, std::array const& indices) + internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, array const& indices) { return internal::tensor_symmetry_value_setter(*this, symmetry, indices); } +#endif protected: - bool checkIndexRange(const std::array& indices) const + bool checkIndexRange(const array& indices) const { using internal::array_apply_and_reduce; using internal::array_zip_and_reduce; @@ -313,7 +325,7 @@ class Tensor array_zip_and_reduce(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const std::array& indices) const + inline Index linearizedIndex(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, m_storage.dimensions()); } @@ -322,7 +334,3 @@ class Tensor } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 000000000..f1df827f9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,52 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
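Note: the templated operator= above routes every assignment through internal::TensorAssign, and per the FIXME it does not yet resize the destination, so the left-hand side must be allocated with matching dimensions beforehand. A hedged usage sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(2, 3);
      Eigen::Tensor<float, 2> b(2, 3);  // pre-sized: operator= does not resize yet
      a.setConstant(1.0f);
      b = a.cwiseAbs();                 // evaluated via internal::TensorAssign::run
      return 0;
    }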
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class is responsible for triggering the evaluation of the expressions + * used on the lhs and rhs of an assignment operator and copy the result of + * the evaluation of the rhs expression at the address computed during the + * evaluation lhs expression. + * + * TODO: vectorization. For now the code only uses scalars + * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + */ +namespace internal { + +template +struct TensorAssign +{ + typedef typename Derived1::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(Derived1& dst, const Derived2& src) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + for(Index i = 0; i < size; ++i) { + evalDst.coeffRef(i) = evalSrc.coeff(i); + } + } +}; + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 000000000..0b9f32f7f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangably in expressions. + */ + +template +class TensorBase +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + + Derived& setZero() { + return setConstant(Scalar(0)); + } + + Derived& setConstant(const Scalar& val) { + Scalar* data = derived().data(); + for (int i = 0; i < derived().size(); ++i) { + data[i] = val; + } + return derived(); + } + + Derived& setRandom() { + Scalar* data = derived().data(); + for (int i = 0; i < derived().size(); ++i) { + data[i] = internal::random_default_impl::run(); + } + return derived(); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwiseSqrt() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwiseAbs() const { return derived(); } + + // Coefficient-wise binary operators. 
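Note: with setConstant/cwiseSqrt above and the operator+ that follows, simple coefficient-wise expressions can already be written. A hedged sketch (each expression is assigned separately, since composing them in one statement assumes the expression types themselves derive from TensorBase):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(2, 2), b(2, 2), s(2, 2), r(2, 2);
      a.setConstant(4.0f);
      b.setConstant(5.0f);
      s = a + b;          // builds a TensorCwiseBinaryOp, evaluated on assignment
      r = b.cwiseSqrt();  // builds a TensorCwiseUnaryOp, evaluated on assignment
      return 0;
    }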
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 000000000..f4f10eff5 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,127 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + * TODO: add support for vectorization + */ + + +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar& CoeffReturnType; + //typedef typename Derived::PacketScalar PacketScalar; + typedef TensorEvaluator nestedType; + + TensorEvaluator(Derived& m) + : m_data(const_cast(m.data())) + { } + + CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + Scalar& coeffRef(Index index) { + return m_data[index]; + } + + // to do: vectorized evaluation. 
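Note: until the packet path below is filled in, an evaluator only needs to expose coeff(i) for reads and coeffRef(i) for writes over a flat linear index, which is all TensorAssign uses. A minimal sketch of that contract, using hypothetical names:

    #include <cassert>

    // Hypothetical stand-in for the evaluator interface TensorAssign relies on.
    struct FlatEvaluator {
      float* data;
      float coeff(long i) const { return data[i]; }  // read path
      float& coeffRef(long i) { return data[i]; }    // write path
    };

    int main() {
      float buf[3] = {1.f, 2.f, 3.f};
      FlatEvaluator ev = {buf};
      ev.coeffRef(0) = ev.coeff(2);  // the assignment loop does exactly this
      assert(buf[0] == 3.f);
      return 0;
    }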
+ /* template + PacketReturnType packet(Index index) const + { + return ploadt(m_data + index); + } + + template + void writePacket(Index index, const PacketScalar& x) + { + return pstoret(const_cast(m_data) + index, x); + }*/ + + protected: + Scalar* m_data; +}; + + + + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorCwiseUnaryOp XprType; + typedef TensorEvaluator nestedType; + + TensorEvaluator(const XprType& op) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + private: + const UnaryOp m_functor; + typename TensorEvaluator::nestedType m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorCwiseBinaryOp XprType; + typedef TensorEvaluator leftType; + typedef TensorEvaluator rightType; + + TensorEvaluator(const XprType& op) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression()), + m_rightImpl(op.rhsExpression()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + + private: + const BinaryOp m_functor; + typename TensorEvaluator::nestedType m_leftImpl; + typename TensorEvaluator::nestedType m_rightImpl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 000000000..5a45cec31 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,161 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary operator + * (e.g. addition) is applied to a lhs and a rhs expression. 
+ * + */ + +namespace internal { +template +struct traits > + : traits +{ + typedef typename result_of< + UnaryOp(typename XprType::Scalar) + >::type Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseUnaryOp +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + inline TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + nestedExpression() { return m_xpr.const_cast_derived(); } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename result_of< + BinaryOp( + typename LhsXprType::Scalar, + typename RhsXprType::Scalar + ) + >::type Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseBinaryOp +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + inline TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 000000000..dc97764f0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,27 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template class Tensor; +template class TensorMap; +template class TensorBase; + +template class TensorCwiseUnaryOp; +template class TensorCwiseBinaryOp; + +// Move to internal? +template struct TensorEvaluator; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 000000000..7dec1e08d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,101 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +template class Stride; + + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ + +template class TensorMap : public TensorBase > +{ + public: + typedef TensorMap Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions({{firstDimension}}) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions({{firstDimension, otherDimensions...}}) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
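Note: TensorMap is the piece that delivers the "map a region of the memory to a tensor" feature from the commit message: it wraps a caller-owned pointer without copying. A hedged usage sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      float storage[12];  // caller-owned buffer holding 3*4 floats
      Eigen::TensorMap<Eigen::Tensor<float, 2> > t(storage, 3, 4);
      t(1, 2) = 42.0f;    // writes straight into storage
      return 0;
    }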
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_dimensions); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const Index index = internal::tensor_index_linearization_helper::run(array{{firstIndex, otherIndices...}}, m_dimensions); + return m_data[index]; + } +#endif + + template + EIGEN_DEVICE_FUNC + Self& operator=(const OtherDerived& other) + { + internal::TensorAssign::run(*this, other); + return *this; + } + + private: + typename PlainObjectType::Scalar* m_data; + array m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index a34600ee6..503d7cfd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -37,14 +37,19 @@ template class TensorStorage : public TensorStorage::type> { - typedef TensorStorage::type> Base_; + typedef TensorStorage::type> Base_; + public: - TensorStorage() = default; - TensorStorage(const TensorStorage&) = default; - TensorStorage(TensorStorage&&) = default; + TensorStorage() { } + TensorStorage(const TensorStorage& other) : Base_(other) { } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES +// TensorStorage(TensorStorage&&) = default; +#endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} - TensorStorage(DenseIndex size, const std::array& dimensions) : Base_(size, dimensions) {} - TensorStorage& operator=(const TensorStorage&) = default; + TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} + + // TensorStorage& operator=(const TensorStorage&) = default; }; // pure dynamic @@ -52,17 +57,17 @@ template class TensorStorage::type> { T *m_data; - std::array m_dimensions; + array m_dimensions; typedef TensorStorage::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions(internal::template repeat(0)) {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat(0)) {} - TensorStorage(DenseIndex size, const std::array& dimensions) - : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) - { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } - TensorStorage(const Self_& other) + TensorStorage(DenseIndex size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + TensorStorage(const Self_& other) : 
m_data(internal::conditional_aligned_new_auto(internal::array_prod(other.m_dimensions))) , m_dimensions(other.m_dimensions) { @@ -76,28 +81,34 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - std::array dimensions(void) const {return m_dimensions;} - void conservativeResize(DenseIndex size, const std::array& nbDimensions) + const array& dimensions() const {return m_dimensions;} + + void conservativeResize(DenseIndex size, const array& nbDimensions) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, internal::array_prod(m_dimensions)); m_dimensions = nbDimensions; } - void resize(DenseIndex size, const std::array& nbDimensions) + void resize(DenseIndex size, const array& nbDimensions) { if(size != internal::array_prod(m_dimensions)) { @@ -110,8 +121,9 @@ class TensorStorage +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? AlignedBit : 0, + packet_access_bit = packet_traits::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret, + }; +}; + + +template +struct traits > + : public traits +{ + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; +}; + + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options_>& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options_>& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorMap& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorMap& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const Tensor& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const Tensor& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorMap& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorMap& type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 0a6c56c19..31583d3ca 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -93,7 +93,7 @@ ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(bdcsvd) -option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." OFF) +option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." 
ON) if(EIGEN_TEST_CXX11) # FIXME: add C++11 compiler switch in some portable way # (MSVC doesn't need any for example, so this will @@ -101,4 +101,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") + ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_expr "-std=c++0x") + ei_add_test(cxx11_tensor_map "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index ea512c9cc..1f76033ea 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -163,7 +163,7 @@ static void test_3d() VERIFY_IS_EQUAL((epsilon(0,2,1)), -1); VERIFY_IS_EQUAL((epsilon(1,0,2)), -1); - std::array dims{{2,3,4}}; + array dims{{2,3,4}}; Tensor t1(dims); Tensor t2(dims); -- cgit v1.2.3 From 0320f7e3a71406b9a03d1bab0d168fd76e63d457 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 May 2014 11:18:37 -0700 Subject: Added support for fixed sized tensors. Improved support for tensor expressions. --- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 +- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 12 +- .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 95 ++++++++- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 39 +--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 212 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 232 +++++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 31 ++- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 46 +++- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 45 +++- unsupported/test/cxx11_tensor_assign.cpp | 195 +++++++++++++++++ unsupported/test/cxx11_tensor_expr.cpp | 145 +++++++++++++ unsupported/test/cxx11_tensor_fixed_size.cpp | 167 +++++++++++++++ unsupported/test/cxx11_tensor_map.cpp | 142 +++++++++++++ 18 files changed, 1319 insertions(+), 82 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h create mode 100644 unsupported/test/cxx11_tensor_assign.cpp create mode 100644 unsupported/test/cxx11_tensor_expr.cpp create mode 100644 unsupported/test/cxx11_tensor_fixed_size.cpp create mode 100644 unsupported/test/cxx11_tensor_map.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f554c204a..f2b18ef31 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -31,6 +31,7 @@ #include "Eigen/Core" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" @@ -41,6 +42,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 
47f06b1b5..accaa94e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -112,7 +112,7 @@ template struct get<0, type_lis template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static int value = a; }; +template struct get<0, numeric_list> { constexpr static T value = a; }; template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 77207f453..f102872ae 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -17,9 +17,6 @@ #error Intel Compiler only supports required C++ features since version 13.1. // note that most stuff in principle works with 13.0 but when combining // some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1)) -// note that it _should_ work with 3.1 but it was only tested with 3.2 -#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1. #elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) // G++ < 4.6 by default will continue processing the source files - even if we use #error to make // it error out. For this reason, we use the pragma to make sure G++ aborts at the first error @@ -40,17 +37,10 @@ #error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) #endif -using std::array; - namespace Eigen { // Use std::array as Eigen array -/*template -struct array : public std::array { - array() = default; - array(const std::initializer_list& a);// : std::array(a) {}; - array(const std::array& a); -};*/ +template using array = std::array; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 76fcba5b4..ab869177c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -11,16 +11,63 @@ #define EIGEN_EMULATE_CXX11_META_H + namespace Eigen { // The array class is only available starting with cxx11. 
Emulate our own here // if needed template class array { public: - T& operator[] (size_t index) { return values[index]; } - const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list l) { + std::copy(l.begin(), l.end(), values); + } +#endif }; @@ -35,8 +82,10 @@ namespace internal { struct empty_list { static const std::size_t count = 0; }; template struct type_list { - T head; - Tail tail; + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; static const std::size_t count = 1 + Tail::count; }; @@ -54,9 +103,25 @@ template<> struct make_type_list<> { }; +template struct get_type; + +template +struct get_type<0, type_list > +{ + typedef Head type; +}; +template +struct get_type > +{ + typedef typename get_type::type type; +}; + + +/* numeric list */ template struct type2val { + typedef T type; static const T value = n; }; @@ -84,6 +149,28 @@ template struct gen_numeric_list_repeated { }; +template struct get; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template +struct get > +{ + typedef typename get::type type; + static const type value = get::value; +}; + +template struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; +}; +template <> struct arg_prod { + static const int value = 1; +}; template array repeat(t v) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 7b8f14c6d..f5c027d1c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -60,26 +60,6 @@ namespace Eigen { namespace internal { -template -struct tensor_index_linearization_helper -{ - static inline Index run(array const& indices, array const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_index_linearization_helper -{ - static inline Index run(array const& indices, array const&) - { - return array_get(indices); - } -}; - /* Forward-declaration required for 
the symmetry support. */ template class tensor_symmetry_value_setter; @@ -102,13 +82,15 @@ class Tensor : public TensorBase > static const int Options = Options_; static const std::size_t NumIndices = NumIndices_; + typedef DSizes Dimensions; + protected: TensorStorage m_storage; public: EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } + EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } @@ -232,13 +214,6 @@ class Tensor : public TensorBase > { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES -// inline Tensor(Self&& other) -// : m_storage(other.m_storage) -// { -// } -#endif - #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... otherDimensions) @@ -327,7 +302,11 @@ class Tensor : public TensorBase > inline Index linearizedIndex(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, m_storage.dimensions()); + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0b9f32f7f..9c7783aaf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -62,6 +62,20 @@ class TensorBase EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> cwiseAbs() const { return derived(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwisePow(Scalar exponent) const { + return TensorCwiseUnaryOp, const Derived> + (derived(), internal::scalar_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator * (Scalar scale) const { + return TensorCwiseUnaryOp, const Derived> + (derived(), internal::scalar_multiple_op(scale)); + } + // Coefficient-wise binary operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 000000000..bd3bd5aca --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. 
It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + + + +// Boiler plate code +namespace internal { + +template struct dget { + static const std::size_t value = internal::get::value; + }; + + +template +struct fixed_size_tensor_index_linearization_helper +{ + template + static inline Index run(array const& indices, + const Dimensions& dimensions) + { + return array_get(indices) + + dget::value * + fixed_size_tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper +{ + template + static inline Index run(array const& indices, + const Dimensions&) + { + return array_get(indices); + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct Sizes : internal::numeric_list { + typedef internal::numeric_list Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + static std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template + explicit Sizes(const array&/* indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + explicit Sizes(std::initializer_list/* l*/) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T&/* other*/) { + // add assertion failure if the size of other is different + return *this; + } + + template + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + template + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + +#else + +template +struct non_zero_size { + typedef internal::type2val type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template struct Sizes { + typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod::value; + + static const size_t TotalSize() { + return internal::arg_prod::value; + } + + Sizes() { } + template + explicit Sizes(const array& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + explicit Sizes(std::initializer_list l) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T& other) { + // to do: check the size of other + return *this; + } + + template + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + } + template + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + } +}; + +#endif + +// Boiler plate +namespace internal { +template +struct tensor_index_linearization_helper +{ + static inline Index run(array const& indices, array const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper +{ + static inline Index run(array const& indices, array const&) + { + return array_get(indices); + 
} +}; +} // end namespace internal + + + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + + size_t TotalSize() const { + return internal::array_prod(*static_cast(this)); + } + + DSizes() { } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + // explicit DSizes(std::initializer_list l) : Base(l) { } +#endif + explicit DSizes(const array& a) : Base(a) { } + + DSizes& operator = (const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + size_t IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + size_t IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f4f10eff5..b0dbca041 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -24,15 +24,12 @@ namespace Eigen { * TODO: add support for vectorization */ - template struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; typedef typename Derived::Scalar& CoeffReturnType; - //typedef typename Derived::PacketScalar PacketScalar; - typedef TensorEvaluator nestedType; TensorEvaluator(Derived& m) : m_data(const_cast(m.data())) @@ -72,7 +69,6 @@ template struct TensorEvaluator > { typedef TensorCwiseUnaryOp XprType; - typedef TensorEvaluator nestedType; TensorEvaluator(const XprType& op) : m_functor(op.functor()), @@ -89,7 +85,7 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - typename TensorEvaluator::nestedType m_argImpl; + TensorEvaluator m_argImpl; }; @@ -99,8 +95,6 @@ template struct TensorEvaluator > { typedef TensorCwiseBinaryOp XprType; - typedef TensorEvaluator leftType; - typedef TensorEvaluator rightType; TensorEvaluator(const XprType& op) : m_functor(op.functor()), @@ -118,8 +112,8 @@ struct TensorEvaluator::nestedType m_leftImpl; - typename TensorEvaluator::nestedType m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 5a45cec31..aa875dc31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -54,7 +54,7 @@ struct nested, 1, typename eval -class TensorCwiseUnaryOp +class TensorCwiseUnaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -75,11 +75,6 @@ class TensorCwiseUnaryOp const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_xpr.const_cast_derived(); } - protected: typename XprType::Nested m_xpr; const UnaryOp m_functor; @@ -124,7 +119,7 @@ struct nested, 1, typename template -class TensorCwiseBinaryOp +class TensorCwiseBinaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 000000000..953880123 --- /dev/null +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixes sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > +{ + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ internal::TensorAssign::run(*this, other); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index dc97764f0..e8a2125c4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -13,6 +13,7 @@ namespace Eigen { template class Tensor; +template class TensorFixedSize; template class TensorMap; template class TensorBase; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 7dec1e08d..bb0b39c5a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -43,24 +43,38 @@ template class TensorMap : public TensorBase({{firstDimension}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions({{firstDimension, otherDimensions...}}) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif + inline TensorMap(PointerArgType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_dimensions); } + EIGEN_STRONG_INLINE const typename PlainObjectType::Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_data; } EIGEN_DEVICE_FUNC @@ -78,8 +92,13 @@ template class TensorMap : public TensorBase::run(array{{firstIndex, otherIndices...}}, m_dimensions); - return m_data[index]; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } } #endif @@ -93,7 +112,7 @@ template class TensorMap : public TensorBase m_dimensions; + typename PlainObjectType::Dimensions m_dimensions; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 503d7cfd6..efcb39559 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -32,6 +32,35 @@ namespace Eigen { */ template class TensorStorage; + +// Pure fixed-size storage +template +class TensorStorage +{ + private: + T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + + // pure-dynamic, but without specification of all dimensions explicitly template class TensorStorage @@ -44,7 +73,7 @@ class TensorStorage TensorStorage(const TensorStorage& other) : Base_(other) { } #ifdef EIGEN_HAVE_RVALUE_REFERENCES -// TensorStorage(TensorStorage&&) = default; + // TensorStorage(TensorStorage&&) = default; #endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} @@ -57,11 +86,11 @@ template class TensorStorage::type> { T *m_data; - array m_dimensions; + DSizes m_dimensions; typedef TensorStorage::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions() {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat(0)) {} TensorStorage(DenseIndex size, const array& dimensions) @@ -83,25 +112,25 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - const 
array& dimensions() const {return m_dimensions;} + const DSizes& dimensions() const {return m_dimensions;} void conservativeResize(DenseIndex size, const array& nbDimensions) { @@ -124,9 +153,10 @@ class TensorStorage > }; +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; +}; + + template struct traits > : public traits @@ -68,16 +77,28 @@ struct traits > }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template +struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options_>& type; + typedef const Tensor<_Scalar, NumIndices_, Options>& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options_>& type; + typedef const TensorFixedSize& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorFixedSize& type; }; template @@ -104,6 +125,18 @@ struct nested, 1, typename eval& type; }; +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorFixedSize& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorFixedSize& type; +}; + template struct nested, 1, typename eval >::type> { diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp new file mode 100644 index 000000000..c88872950 --- /dev/null +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -0,0 +1,195 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
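Note on index linearization: the IndexOfColMajor/IndexOfRowMajor helpers used by Tensor, TensorFixedSize and TensorMap above all compute the usual strided flattening of a multi-index. A minimal standalone sketch of that arithmetic (hypothetical helper names; this is not the Eigen implementation, which unrolls the recursion at compile time):

    #include <cstddef>
    #include <cassert>

    // Column-major (Eigen's default): the first index varies fastest.
    std::size_t index_of_col_major(const std::size_t* idx, const std::size_t* dims, int rank) {
      std::size_t linear = 0;
      for (int i = rank - 1; i >= 0; --i) {
        assert(idx[i] < dims[i]);
        linear = linear * dims[i] + idx[i];  // fold dimensions from last to first
      }
      return linear;
    }

    // Row-major: the last index varies fastest.
    std::size_t index_of_row_major(const std::size_t* idx, const std::size_t* dims, int rank) {
      std::size_t linear = 0;
      for (int i = 0; i < rank; ++i) {
        assert(idx[i] < dims[i]);
        linear = linear * dims[i] + idx[i];  // fold dimensions from first to last
      }
      return linear;
    }

For a 2x3 tensor, the multi-index (1,0) maps to 1 + 2*0 = 1 in column-major order but to 1*3 + 0 = 3 in row-major order, which is why the tests below keep separate col_major and row_major buffers.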
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1(6); + Tensor vec2(6); + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap> vec3(col_major, 6); + TensorMap> vec4(row_major, 6); + + vec3 = vec1; + vec4 = vec2; + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); + + vec1.setZero(); + vec2.setZero(); + vec1 = vec3; + vec2 = vec4; + + VERIFY_IS_EQUAL(vec1(0), 4); + VERIFY_IS_EQUAL(vec1(1), 8); + VERIFY_IS_EQUAL(vec1(2), 15); + VERIFY_IS_EQUAL(vec1(3), 16); + VERIFY_IS_EQUAL(vec1(4), 23); + VERIFY_IS_EQUAL(vec1(5), 42); + + VERIFY_IS_EQUAL(vec2(0), 0); + VERIFY_IS_EQUAL(vec2(1), 1); + VERIFY_IS_EQUAL(vec2(2), 2); + VERIFY_IS_EQUAL(vec2(3), 3); + VERIFY_IS_EQUAL(vec2(4), 4); + VERIFY_IS_EQUAL(vec2(5), 5); +} + +static void test_2d() +{ + Tensor mat1(2,3); + Tensor mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap> mat3(row_major, 2, 3); + TensorMap> mat4(col_major, 2, 3); + + mat3 = mat1; + mat4 = mat2; + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + VERIFY_IS_EQUAL(mat1(0,0), 0); + VERIFY_IS_EQUAL(mat1(0,1), 1); + VERIFY_IS_EQUAL(mat1(0,2), 2); + VERIFY_IS_EQUAL(mat1(1,0), 3); + VERIFY_IS_EQUAL(mat1(1,1), 4); + VERIFY_IS_EQUAL(mat1(1,2), 5); + + VERIFY_IS_EQUAL(mat2(0,0), 0); + VERIFY_IS_EQUAL(mat2(0,1), 1); + VERIFY_IS_EQUAL(mat2(0,2), 2); + VERIFY_IS_EQUAL(mat2(1,0), 3); + VERIFY_IS_EQUAL(mat2(1,1), 4); + VERIFY_IS_EQUAL(mat2(1,2), 5); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + int col_major[2*3*7]; + int row_major[2*3*7]; + memset(col_major, 0, 2*3*7*sizeof(int)); + memset(row_major, 0, 2*3*7*sizeof(int)); + TensorMap> mat3(col_major, 2, 3, 7); + TensorMap> mat4(row_major, 2, 3, 7); + + mat3 = mat1; + mat4 = mat2; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + val = 0; + for (int i = 0; i < 2; ++i) { + for 
(int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat1(i,j,k), val); + VERIFY_IS_EQUAL(mat2(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_assign() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp new file mode 100644 index 000000000..e0124da8c --- /dev/null +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -0,0 +1,145 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1({6}); + Tensor vec2({6}); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap> vec3(data3, 6); + vec3 = vec1.cwiseSqrt(); + float data4[6]; + TensorMap> vec4(data4, 6); + vec4 = vec2.cwiseSqrt(); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap> mat1(data1, 2, 3); + float data2[6]; + TensorMap> mat2(data2, 2, 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + Tensor mat3(2,3); + Tensor mat4(2,3); + mat3 = mat1.cwiseAbs(); + mat4 = mat2.cwiseAbs(); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + Tensor mat3(2,3,7); + mat3 = mat1 + mat1; + Tensor mat4(2,3,7); + mat4 = mat2 * 3.14f; + Tensor mat5(2,3,7); + mat5 = mat1.cwiseSqrt().cwiseSqrt(); + Tensor mat6(2,3,7); + mat6 = mat2.cwiseSqrt() * 3.14f; + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int 
j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), val + val); + VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); + VERIFY_IS_APPROX(mat5(i,j,k), sqrtf(sqrtf(val))); + VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + val += 1.0; + } + } + } +} + + +void test_cxx11_tensor_expr() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp new file mode 100644 index 000000000..c1d74d881 --- /dev/null +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_1d() +{ + TensorFixedSize > vec1; + TensorFixedSize, RowMajor> vec2; + + VERIFY_IS_EQUAL((vec1.size()), 6); + // VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec1.dimension(0)), 6); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap > > vec3(data3, 6); + vec3 = vec1.cwiseSqrt(); + float data4[6]; + TensorMap, RowMajor> > vec4(data4, 6); + vec4 = vec2.cwiseSqrt(); + + VERIFY_IS_EQUAL((vec3.size()), 6); + // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap >> mat1(data1,2,3); + float data2[6]; + TensorMap, RowMajor>> mat2(data2,2,3); + + VERIFY_IS_EQUAL((mat1.size()), 2*3); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + TensorFixedSize> mat3; + TensorFixedSize, RowMajor> mat4; + mat3 = mat1.cwiseAbs(); + mat4 = mat2.cwiseAbs(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + 
VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + TensorFixedSize > mat1; + TensorFixedSize, RowMajor> mat2; + + VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize > mat3; + mat3 = mat1.cwiseSqrt(); + TensorFixedSize, RowMajor> mat4; + mat4 = mat2.cwiseSqrt(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat3.dimension(2)), 7); + + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); + VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); + val += 1.0; + } + } + } +} + + +void test_cxx11_tensor_fixed_size() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp new file mode 100644 index 000000000..478c20306 --- /dev/null +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
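The cxx11_tensor_map.cpp test that follows checks that a TensorMap is a pure view: it adopts a caller-owned buffer without copying, so writes through either alias are visible through the other. A small usage sketch of the same idea (illustrative only, assuming the TensorMap interface added in this series):

    #include <Eigen/CXX11/Tensor>

    void tensor_map_sketch() {
      float buffer[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};

      // Reinterpret the six floats as a 2x3 column-major tensor; no copy is made.
      Eigen::TensorMap<Eigen::Tensor<float, 2> > view(buffer, 2, 3);

      view(0, 1) = 42.0f;       // writes buffer[0 + 2*1], i.e. buffer[2]
      float last = view(1, 2);  // reads  buffer[1 + 2*2], i.e. buffer[5]
      (void)last;
    }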
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1(6); + Tensor vec2(6); + + TensorMap> vec3(vec1.data(), 6); + TensorMap> vec4(vec2.data(), 6); + + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + VERIFY_IS_EQUAL(vec1.size(), 6); + VERIFY_IS_EQUAL(vec1.dimension(0), 6); + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); +} + +static void test_2d() +{ + Tensor mat1(2,3); + Tensor mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + TensorMap> mat3(mat1.data(), 2, 3); + TensorMap> mat4(mat2.data(), 2, 3); + + VERIFY_IS_EQUAL(mat3.size(), 6); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + + VERIFY_IS_EQUAL(mat4.size(), 6); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + TensorMap> mat3(mat1.data(), 2, 3, 7); + TensorMap> mat4(mat2.data(), 2, 3, 7); + + VERIFY_IS_EQUAL(mat3.size(), 2*3*7); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + VERIFY_IS_EQUAL(mat3.dimension(2), 7); + + VERIFY_IS_EQUAL(mat4.size(), 2*3*7); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + VERIFY_IS_EQUAL(mat4.dimension(2), 7); + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_map() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} -- cgit v1.2.3 From 7402fea0a8e63e3ea248257047c584afee8f8bde Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 16 May 2014 15:08:05 -0700 Subject: Vectorized the evaluation of tensor expression (using SSE, AVX, NEON, ...) Added the ability to parallelize the evaluation of a tensor expression over multiple cpu cores. Added the ability to offload the evaluation of a tensor expression to a GPU. 
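Before the file-by-file changes, a sketch of how the three evaluation strategies are meant to be selected from user code (illustrative, not part of the patch; the device classes and the .device() method are the ones introduced by this commit, and EIGEN_USE_THREADS / EIGEN_USE_GPU must be defined at build time):

    #include <Eigen/CXX11/Tensor>

    void evaluate_sum(Eigen::Tensor<float, 3>& A, Eigen::Tensor<float, 3>& B,
                      Eigen::Tensor<float, 3>& C) {
      // Default device: a single cpu core, using SSE/AVX/NEON packets when
      // both evaluators report PacketAccess.
      C = A + B;

    #ifdef EIGEN_USE_THREADS
      // Thread pool device: the flat index space is cut into packet-aligned
      // blocks and each block is evaluated by a std::async task.
      Eigen::ThreadPoolDevice pool(8 /* number of cores */);
      C.device(pool) = A + B;
    #endif

    #ifdef EIGEN_USE_GPU
      // Gpu device: the assignment runs as a cuda kernel on the device stream.
      Eigen::GpuDevice gpu;
      C.device(gpu) = A + B;
    #endif
    }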
---
 unsupported/Eigen/CXX11/Tensor                     |   2 +
 unsupported/Eigen/CXX11/src/Tensor/Tensor.h        |   8 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h  | 145 ++++++++++++++++++-
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h    |  12 ++
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h  |  83 +++++++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h      |  56 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h      |  14 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |  54 +++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h    |  27 ++--
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h       |  10 +-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h   |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h     | 158 +++++++++++++++++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h |  19 ---
 unsupported/test/CMakeLists.txt                    |   3 +
 unsupported/test/cxx11_tensor_device.cpp           | 126 ++++++++++++++
 unsupported/test/cxx11_tensor_fixed_size.cpp       |  28 ++++
 unsupported/test/cxx11_tensor_thread_pool.cpp      |  37 +++++
 17 files changed, 720 insertions(+), 66 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
 create mode 100644 unsupported/test/cxx11_tensor_device.cpp
 create mode 100644 unsupported/test/cxx11_tensor_thread_pool.cpp

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index f2b18ef31..323d9edff 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -31,6 +31,7 @@
 #include "Eigen/Core"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 
@@ -39,6 +40,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index f5c027d1c..d8ff3f584 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -75,9 +75,15 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
     typedef typename internal::traits<Self>::StorageKind StorageKind;
     typedef typename internal::traits<Self>::Index Index;
     typedef Scalar_ Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename Base::PacketReturnType PacketReturnType;
+
+    enum {
+      IsAligned = bool(EIGEN_ALIGN),
+      PacketAccess = true,
+    };
 
     static const int Options = Options_;
     static const std::size_t NumIndices = NumIndices_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index f1df827f9..e69ff6188 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
 #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
 
 namespace Eigen {
@@ -28,7 +31,8 @@
  */
 namespace internal {
 
-template<typename Derived1, typename Derived2>
+// Default strategy: the expressions are evaluated with a single cpu thread.
+template<typename Derived1, typename Derived2, bool Vectorizable = TensorEvaluator<Derived1>::PacketAccess & TensorEvaluator<Derived2>::PacketAccess>
 struct TensorAssign
 {
   typedef typename Derived1::Index Index;
@@ -38,13 +42,150 @@ struct TensorAssign
     TensorEvaluator<Derived1> evalDst(dst);
     TensorEvaluator<Derived2> evalSrc(src);
     const Index size = dst.size();
-    for(Index i = 0; i < size; ++i) {
+    for (Index i = 0; i < size; ++i) {
+      evalDst.coeffRef(i) = evalSrc.coeff(i);
+    }
+  }
+};
+
+
+template<typename Derived1, typename Derived2>
+struct TensorAssign<Derived1, Derived2, true>
+{
+  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Derived1& dst, const Derived2& src)
+  {
+    TensorEvaluator<Derived1> evalDst(dst);
+    TensorEvaluator<Derived2> evalSrc(src);
+    const Index size = dst.size();
+
+    static const int LhsStoreMode = TensorEvaluator<Derived1>::IsAligned ? Aligned : Unaligned;
+    static const int RhsLoadMode = TensorEvaluator<Derived2>::IsAligned ? Aligned : Unaligned;
+    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Derived1>::PacketReturnType>::size;
+    static const int VectorizedSize = (size / PacketSize) * PacketSize;
+
+    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+      evalDst.template writePacket<LhsStoreMode>(i, evalSrc.template packet<RhsLoadMode>(i));
+    }
+    for (Index i = VectorizedSize; i < size; ++i) {
       evalDst.coeffRef(i) = evalSrc.coeff(i);
     }
   }
 };
 
+
+// Multicore strategy: the index space is partitioned and each core is assigned to a partition
+#ifdef EIGEN_USE_THREADS
+template <typename LhsEval, typename RhsEval, typename Index, bool Vectorizable = LhsEval::PacketAccess & RhsEval::PacketAccess>
+struct EvalRange {
+  static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) {
+    eigen_assert(last > first);
+    for (Index i = first; i < last; ++i) {
+      dst.coeffRef(i) = src.coeff(i);
+    }
+  }
+};
+
+template <typename LhsEval, typename RhsEval, typename Index>
+struct EvalRange<LhsEval, RhsEval, Index, true> {
+  static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) {
+    eigen_assert(last > first);
+
+    Index i = first;
+    static const int PacketSize = unpacket_traits<typename LhsEval::PacketReturnType>::size;
+    if (last - first > PacketSize) {
+      static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned;
+      static const int RhsLoadMode = RhsEval::IsAligned ? Aligned : Unaligned;
+      eigen_assert(first % PacketSize == 0);
+      Index lastPacket = last - (last % PacketSize);
+      for (; i < lastPacket; i += PacketSize) {
+        dst.template writePacket<LhsStoreMode>(i, src.template packet<RhsLoadMode>(i));
+      }
+    }
+
+    for (; i < last; ++i) {
+      dst.coeffRef(i) = src.coeff(i);
+    }
+  }
+};
+
+template <typename Derived1, typename Derived2>
+struct TensorAssignMultiThreaded
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device)
+  {
+    TensorEvaluator<Derived1> evalDst(dst);
+    TensorEvaluator<Derived2> evalSrc(src);
+    const Index size = dst.size();
+
+    static const bool Vectorizable = TensorEvaluator<Derived1>::PacketAccess & TensorEvaluator<Derived2>::PacketAccess;
+    static const int PacketSize = Vectorizable ? unpacket_traits<typename TensorEvaluator<Derived1>::PacketReturnType>::size : 1;
+
+    int blocksz = static_cast<int>(ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1);
+    const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize)));
+    const Index numblocks = size / blocksize;
+
+    Index i = 0;
+    vector<std::future<void> > results;
+    results.reserve(numblocks);
+    for (int i = 0; i < numblocks; ++i) {
+      results.push_back(std::async(std::launch::async, &EvalRange<TensorEvaluator<Derived1>, TensorEvaluator<Derived2>, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize));
+    }
+
+    for (int i = 0; i < numblocks; ++i) {
+      results[i].get();
+    }
+
+    if (numblocks * blocksize < size) {
+      EvalRange<TensorEvaluator<Derived1>, TensorEvaluator<Derived2>, Index>::run(evalDst, evalSrc, numblocks * blocksize, size);
+    }
+  }
+};
+#endif
+
+
+// GPU: the evaluation of the expressions is offloaded to a GPU.
+#ifdef EIGEN_USE_GPU +template +__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + evalDst.coeffRef(index) = evalSrc.coeff(index); +} +template +__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { + const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + evalDst.coeffRef(index) = evalSrc.coeff(index); + } +} + +template +struct TensorAssignGpu +{ + typedef typename Derived1::Index Index; + static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + const int block_size = std::min(size, 32*32); + const int num_blocks = size / block_size; + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + + const int remaining_items = size % block_size; + if (remaining_items > 0) { + const int peel_start_offset = num_blocks * block_size; + const int peel_block_size = std::min(size, 32); + const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + } + } +}; +#endif + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 9c7783aaf..fa1bd3498 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -28,6 +28,7 @@ class TensorBase typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::Index Index; typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; Derived& setZero() { return setConstant(Scalar(0)); @@ -83,6 +84,17 @@ class TensorBase return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template + TensorDevice device(const DeviceType& device) { + return TensorDevice(device, derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 000000000..71890e187 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,83 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
+ */ + +template class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssign::run(m_expression, other); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template class TensorDevice { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +#ifdef EIGEN_USE_GPU +template class TensorDevice +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssignGpu::run(m_expression, other, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 000000000..ded6ca604 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { +}; + + +// Multiple cpu cores +// We should really use a thread pool here but first we need to find a portable thread pool library. +#ifdef EIGEN_USE_THREADS +struct ThreadPoolDevice { + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + size_t numThreads() const { return num_threads_; } + /*ThreadPool* threadPool() const { return pool_; }*/ + + private: + // todo: NUMA, ... 
+ size_t num_threads_; + /*ThreadPool* pool_;*/ +}; +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU +struct GpuDevice { + // todo: support for multiple gpu; + GpuDevice() { + cudaStreamCreate(&stream_); + } + ~GpuDevice() { + cudaStreamDestroy(stream_); + } + const cudaStream_t& stream() const { return stream_; } + + private: + cudaStream_t stream_; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index bd3bd5aca..43e9d6550 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -79,16 +79,16 @@ struct Sizes : internal::numeric_list { Sizes() { } template - explicit Sizes(const array&/* indices*/) { + explicit Sizes(const array& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - explicit Sizes(std::initializer_list/* l*/) { + explicit Sizes(std::initializer_list /*l*/) { // todo: add assertion } #endif - template Sizes& operator = (const T&/* other*/) { + template Sizes& operator = (const T& /*other*/) { // add assertion failure if the size of other is different return *this; } @@ -119,7 +119,7 @@ template ::value; - static const size_t TotalSize() { + static size_t TotalSize() { return internal::arg_prod::value; } @@ -181,14 +181,11 @@ template struct DSizes : array { typedef array Base; - size_t TotalSize() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { return internal::array_prod(*static_cast(this)); } DSizes() { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES - // explicit DSizes(std::initializer_list l) : Base(l) { } -#endif explicit DSizes(const array& a) : Base(a) { } DSizes& operator = (const array& other) { @@ -203,7 +200,6 @@ struct DSizes : array { size_t IndexOfRowMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b0dbca041..3ce924dc3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -29,32 +29,38 @@ struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar& CoeffReturnType; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + }; TensorEvaluator(Derived& m) : m_data(const_cast(m.data())) { } - CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_data[index]; } - Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { return m_data[index]; } - // to do: vectorized evaluation. 
- /* template + template PacketReturnType packet(Index index) const { - return ploadt(m_data + index); + return internal::ploadt(m_data + index); } - template - void writePacket(Index index, const PacketScalar& x) + template + void writePacket(Index index, const Packet& x) { - return pstoret(const_cast(m_data) + index, x); - }*/ + return internal::pstoret(m_data + index, x); + } protected: Scalar* m_data; @@ -70,6 +76,11 @@ struct TensorEvaluator > { typedef TensorCwiseUnaryOp XprType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + }; + TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) @@ -77,12 +88,19 @@ struct TensorEvaluator > typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; - CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + private: const UnaryOp m_functor; TensorEvaluator m_argImpl; @@ -96,6 +114,12 @@ struct TensorEvaluator XprType; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + }; + TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_leftImpl(op.lhsExpression()), @@ -104,11 +128,17 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); + } private: const BinaryOp m_functor; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index aa875dc31..e32077f6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -33,6 +33,9 @@ struct traits > typedef typename result_of< UnaryOp(typename XprType::Scalar) >::type Scalar; + typedef typename result_of< + UnaryOp(typename XprType::Packet) + >::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -57,14 +60,16 @@ template class TensorCwiseUnaryOp : public TensorBase > { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - inline TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const 
UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} EIGEN_DEVICE_FUNC @@ -92,6 +97,7 @@ struct traits > typename RhsXprType::Scalar ) >::type Scalar; + typedef typename internal::packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -123,14 +129,17 @@ class TensorCwiseBinaryOp : public TensorBase::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - inline TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 953880123..dcc7ccd65 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -33,11 +33,17 @@ class TensorFixedSize : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef Scalar_ Scalar; - typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; - static const int Options = Options_; + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = true, + }; + typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index e8a2125c4..09b0fe66d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -14,12 +14,14 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; -template class TensorMap; +template class TensorMap; template class TensorBase; template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; +template class TensorDevice; + // Move to internal? 
template struct TensorEvaluator; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index bb0b39c5a..3fc9c5335 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -22,16 +22,16 @@ template class Strid * */ -template class TensorMap : public TensorBase > +template class TensorMap : public TensorBase > { public: - typedef TensorMap Self; + typedef TensorMap Self; typedef typename PlainObjectType::Base Base; typedef typename Eigen::internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; - typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; @@ -43,13 +43,12 @@ template class TensorMap : public TensorBase({{firstDimension}})) { @@ -65,7 +64,7 @@ template class TensorMap : public TensorBase& dimensions) + inline TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -80,12 +79,97 @@ template class TensorMap : public TensorBase& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC @@ -100,8 +184,60 @@ template class TensorMap : public TensorBase= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } #endif + template EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index efcb39559..64098343e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -72,9 +72,6 @@ class TensorStorage TensorStorage() { } TensorStorage(const TensorStorage& other) : Base_(other) { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES - // TensorStorage(TensorStorage&&) = default; -#endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} @@ -111,22 +108,6 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } diff --git a/unsupported/test/CMakeLists.txt 
b/unsupported/test/CMakeLists.txt index 31583d3ca..abc3375e5 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -104,4 +104,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_assign "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") + ei_add_test(cxx11_tensor_device "-std=c++0x") +# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp new file mode 100644 index 000000000..9eb1d0420 --- /dev/null +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_device +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +// Context for evaluation on cpu +struct CPUContext { + CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out) { } + + const Eigen::Tensor& in1() const { return in1_; } + const Eigen::Tensor& in2() const { return in2_; } + Eigen::TensorDevice, Eigen::DefaultDevice> out() { return TensorDevice, Eigen::DefaultDevice>(cpu_device_, out_); } + + private: + const Eigen::Tensor& in1_; + const Eigen::Tensor& in2_; + Eigen::Tensor& out_; + + Eigen::DefaultDevice cpu_device_; +}; + + +// Context for evaluation on GPU +struct GPUContext { + GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out) { } + + const Eigen::TensorMap >& in1() const { return in1_; } + const Eigen::TensorMap >& in2() const { return in2_; } + Eigen::TensorDevice >, Eigen::GpuDevice> out() { return TensorDevice >, Eigen::GpuDevice>(gpu_device_, out_); } + + private: + const Eigen::TensorMap >& in1_; + const Eigen::TensorMap >& in2_; + Eigen::TensorMap >& out_; + Eigen::GpuDevice gpu_device_; +}; + + +// The actual expression to evaluate +template +static void test_contextual_eval(Context* context) +{ + context->out() = context->in1() + context->in2() * 3.14f; +} + +static void test_cpu() { + Eigen::Tensor in1(Eigen::array(2,3,7)); + Eigen::Tensor in2(Eigen::array(2,3,7)); + Eigen::Tensor out(Eigen::array(2,3,7)); + + in1.setRandom(); + in2.setRandom(); + CPUContext context(in1, in2, out); + test_contextual_eval(&context); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + } + } + } +} + +static void test_gpu() { + Eigen::Tensor in1(Eigen::array(2,3,7)); + Eigen::Tensor in2(Eigen::array(2,3,7)); + Eigen::Tensor out(Eigen::array(2,3,7)); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + 
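+ // d_out will hold the result of the expression evaluated on the GPU; error checking of the CUDA calls is omitted in this test.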
cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(2,3,7)); + Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(2,3,7)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(2,3,7)); + + GPUContext context(gpu_in1, gpu_in2, gpu_out); + test_contextual_eval(&context); + + cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + } + } + } +} + + + +void test_cxx11_tensor_device() +{ + CALL_SUBTEST(test_cpu()); + CALL_SUBTEST(test_gpu()); +} diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index c1d74d881..214f6951d 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -159,9 +159,37 @@ static void test_3d() } +static void test_array() +{ + TensorFixedSize > mat1; + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(array(i,j,k)) = val; + val += 1.0; + } + } + } + + TensorFixedSize > mat3; + mat3 = mat1.cwisePow(3.5f); + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(array(i,j,k)), powf(val, 3.5f)); + val += 1.0; + } + } + } +} + void test_cxx11_tensor_fixed_size() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_array()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp new file mode 100644 index 000000000..c9de71da3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -0,0 +1,37 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + + +#include "main.h" +#include + +using Eigen::Tensor; + +void test_cxx11_tensor_thread_pool() +{ + Eigen::Tensor in1(Eigen::array(2,3,7)); + Eigen::Tensor in2(Eigen::array(2,3,7)); + Eigen::Tensor out(Eigen::array(2,3,7)); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(3); + out.device(thread_pool_device) = in1 + in2 * 3.14; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + } + } + } +} -- cgit v1.2.3 From 736267cf6b17832a571acf7e34ca07c7f55907ee Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 22 May 2014 16:22:35 -0700 Subject: Added support for additional tensor operations: * comparison (<, <=, ==, !=, ...) * selection * nullary ops such as random or constant generation * misc unary ops such as log(), exp(), or a user defined unaryExpr() Cleaned up the code a little. 
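A minimal sketch of how the operations introduced by this patch combine (illustrative only: the tensor names, the ClampOp functor, and the include path are assumptions rather than part of the patch, and a C++11 compiler is assumed):

#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical user-defined functor for unaryExpr().
struct ClampOp {
  float operator()(float x) const { return x < 0.0f ? 0.0f : x; }
};

static void sketch_new_ops()
{
  Eigen::Tensor<float, 2> a(2, 3), b(2, 3), result(2, 3);
  a.setRandom();                     // nullary op: random generation
  b.setConstant(1.0f);               // nullary op: constant generation
  result = a.exp() + b.log();        // misc unary ops, combined coefficient-wise
  result = a.unaryExpr(ClampOp());   // user-defined unary op
  result = (a > b).select(a, b);     // comparison feeding a selection
}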
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 139 ++++++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 84 +++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 109 ++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 36 +++--- 5 files changed, 339 insertions(+), 31 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fa1bd3498..8a88ba806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -33,21 +33,25 @@ class TensorBase Derived& setZero() { return setConstant(Scalar(0)); } - Derived& setConstant(const Scalar& val) { - Scalar* data = derived().data(); - for (int i = 0; i < derived().size(); ++i) { - data[i] = val; - } - return derived(); + return derived() = constant(val); } - Derived& setRandom() { - Scalar* data = derived().data(); - for (int i = 0; i < derived().size(); ++i) { - data[i] = internal::random_default_impl::run(); - } - return derived(); + return derived() = random(); + } + + // Nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return TensorCwiseNullaryOp, const Derived> + (internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return TensorCwiseNullaryOp, const Derived>(); } // Coefficient-wise unary operators @@ -57,15 +61,31 @@ class TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwiseSqrt() const { return derived(); } + sqrt() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { return derived(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwiseAbs() const { return derived(); } + abs() const { return derived(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwisePow(Scalar exponent) const { + pow(Scalar exponent) const { return TensorCwiseUnaryOp, const Derived> (derived(), internal::scalar_pow_op(exponent)); } @@ -77,6 +97,30 @@ class TensorBase (derived(), internal::scalar_multiple_op(scale)); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cast() const { + return derived(); + } + // Coefficient-wise binary operators. 
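+ // These operators build expression trees that are evaluated lazily on assignment, e.g. out = in1 + in2 * 3.14f;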
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -90,6 +134,71 @@ class TensorBase return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + // Coefficient-wise ternary operators. + template + inline const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const{ + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Select the device on which to evaluate the expression. 
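+ // For example: out.device(my_device) = in1 + in2 * 3.14f; where my_device is e.g. a ThreadPoolDevice or a GpuDevice.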
template TensorDevice device(const DeviceType& device) { return TensorDevice(device, derived()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 3ce924dc3..e0c0863b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -68,6 +68,42 @@ struct TensorEvaluator +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorCwiseNullaryOp XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess, + }; + + TensorEvaluator(const XprType& op) + : m_functor(op.functor()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(index); + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_functor.packetOp(index); + } + + private: + const NullaryOp m_functor; +}; + + // -------------------- CwiseUnaryOp -------------------- @@ -146,6 +182,54 @@ struct TensorEvaluator m_rightImpl; }; + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorSelectOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & + TensorEvaluator::PacketAccess*/, + }; + + TensorEvaluator(const XprType& op) + : m_condImpl(op.ifExpression()), + m_thenImpl(op.thenExpression()), + m_elseImpl(op.elseExpression()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + static const int PacketSize = internal::unpacket_traits::size; + internal::Selector select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index e32077f6e..94cfae05c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -17,6 +17,9 @@ namespace Eigen { * * \brief Tensor expression classes. * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. This + * is typically used to generate constants. + * * The TensorCwiseUnaryOp class represents an expression where a unary operator * (e.g. cwiseSqrt) is applied to an expression. * @@ -24,6 +27,46 @@ namespace Eigen { * (e.g. addition) is applied to a lhs and a rhs expression. 
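+ *
+ * The TensorSelectOp class represents an expression where coefficients are
+ * taken from either a "then" or an "else" expression, depending on the value
+ * of a condition expression (see TensorBase::select()).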
* */ +namespace internal { +template +struct traits > + : traits +{ + typedef typename PlainObjectType::Packet Packet; + typedef typename PlainObjectType::Scalar Scalar; + typedef typename PlainObjectType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; +}; + +} // end namespace internal + + + +template +class TensorCwiseNullaryOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; + typedef typename PlainObjectType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const NullaryOp& func = NullaryOp()) + : m_functor(func) {} + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + // todo: add tensor dimension to be able to do some sanity checks + const NullaryOp m_functor; +}; + + namespace internal { template @@ -160,6 +203,72 @@ class TensorCwiseBinaryOp : public TensorBase +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSelectOp type; +}; + +} // end namespace internal + + +template +class TensorSelectOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + const IfXprType& ifExpression() const { return m_condition; } + + const ThenXprType& thenExpression() const { return m_then; } + + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 09b0fe66d..03ac8d516 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -17,8 +17,10 @@ template class TensorFi template class TensorMap; template class TensorBase; +template class 
TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; +template class TensorSelectOp; template class TensorDevice; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3fc9c5335..3a2ff5b30 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -45,33 +45,37 @@ template class TensorMap : public Tensor static const int Options = Options_; + static const std::size_t NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), PacketAccess = true, }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array({{firstDimension}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + inline TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename PlainObjectType::Dimensions& dimensions() const { return m_dimensions; } + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC @@ -80,7 +84,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -96,12 +100,12 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const { - static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } @@ -159,7 +163,7 @@ template class TensorMap : public Tensor #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -175,12 +179,12 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } @@ -247,8 +251,8 @@ template class TensorMap : public Tensor } private: - typename PlainObjectType::Scalar* m_data; - typename PlainObjectType::Dimensions m_dimensions; + Scalar* m_data; + Dimensions m_dimensions; }; } // end namespace Eigen -- cgit v1.2.3 From 6fa6cdd2b988da98cbdd2b1a5fd2fd3b9d56a4b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 4 Jun 2014 09:21:48 -0700 Subject: Added support for tensor contractions Updated expression evaluation mechanism to also compute the size of the tensor result Misc fixes and improvements. 
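A minimal sketch of the new contraction API (illustrative only: the tensor names and the include path are assumptions; DimensionPair is the typedef added to TensorBase below, and a C++11 compiler is assumed):

#include <unsupported/Eigen/CXX11/Tensor>

static void sketch_contraction()
{
  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;

  Eigen::Tensor<float, 2> lhs(2, 3), rhs(3, 7), result(2, 7);
  lhs.setRandom();
  rhs.setRandom();

  // Sum over dimension 1 of lhs and dimension 0 of rhs: the tensor
  // generalization of a 2x3 times 3x7 matrix product, giving a 2x7 result.
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(1, 0);
  result = lhs.contract(rhs, dims);
}

At this stage the evaluator accumulates each output coefficient recursively via partialStitch and deliberately reports PacketAccess as false, so contractions are not yet vectorized.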
--- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 2 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 38 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 50 +++-- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 229 +++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 29 +-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 44 +++- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 36 ++-- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 11 +- 14 files changed, 370 insertions(+), 96 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 323d9edff..d4e8d3a15 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -39,6 +39,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index ab869177c..636063f9e 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -23,6 +23,8 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + static const std::size_t size = n; + T values[n]; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index d8ff3f584..e034f8c03 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -81,7 +81,7 @@ class Tensor : public TensorBase > typedef typename Base::PacketReturnType PacketReturnType; enum { - IsAligned = bool(EIGEN_ALIGN), + IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), PacketAccess = true, }; @@ -94,11 +94,11 @@ class Tensor : public TensorBase > TensorStorage m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -116,13 +116,13 @@ class Tensor : public 
TensorBase > } #endif - inline const Scalar& coeff(const array& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline const Scalar& coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; @@ -138,13 +138,13 @@ class Tensor : public TensorBase > } #endif - inline Scalar& coeffRef(const array& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; @@ -160,19 +160,19 @@ class Tensor : public TensorBase > } #endif - inline const Scalar& operator()(const array& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); } - inline const Scalar& operator()(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return coeff(index); } - inline const Scalar& operator[](Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { // The bracket operator is only for vectors, use the parenthesis operator instead. EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -189,19 +189,19 @@ class Tensor : public TensorBase > } #endif - inline Scalar& operator()(const array& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); } - inline Scalar& operator()(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } - inline Scalar& operator[](Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { // The bracket operator is only for vectors, use the parenthesis operator instead EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -223,11 +223,10 @@ class Tensor : public TensorBase > #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage() + : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - resize(array{{firstDimension, otherDimensions...}}); } #endif @@ -237,7 +236,6 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) @@ -306,7 +304,7 @@ class Tensor : public TensorBase > array_zip_and_reduce(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const array& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { if (Options&RowMajor) { return m_storage.dimensions().IndexOfRowMajor(indices); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index e69ff6188..da1eb62cb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -53,7 +53,6 @@ template struct TensorAssign { typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC static inline void run(Derived1& dst, const Derived2& src) { TensorEvaluator evalDst(dst); @@ -63,7 +62,7 @@ struct TensorAssign static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; static const int PacketSize = unpacket_traits::PacketReturnType>::size; - static const int VectorizedSize = (size / PacketSize) * PacketSize; + const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evalDst.template writePacket(i, evalSrc.template packet(i)); @@ -148,7 +147,7 @@ struct TensorAssignMultiThreaded // GPU: the evaluation of the expressions is offloaded to a GPU. -#ifdef EIGEN_USE_GPU +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template __global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { const int index = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 8a88ba806..c5c711313 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,13 +30,16 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - Derived& setZero() { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { return setConstant(Scalar(0)); } - Derived& setConstant(const Scalar& val) { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { return derived() = constant(val); } - Derived& setRandom() { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { return derived() = random(); } @@ -45,13 +48,13 @@ class TensorBase EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> constant(const Scalar& value) const { return TensorCwiseNullaryOp, const Derived> - (internal::scalar_constant_op(value)); + (derived(), internal::scalar_constant_op(value)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(); + return TensorCwiseNullaryOp, const Derived>(derived()); } // Coefficient-wise unary operators @@ -124,77 +127,86 @@ class TensorBase // Coefficient-wise binary operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator+(const OtherDerived& other) const { + operator+(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator-(const OtherDerived& other) const { + operator-(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator*(const OtherDerived& other) const { + operator*(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator/(const OtherDerived& other) const { + operator/(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMax(const OtherDerived& other) const { + cwiseMax(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMin(const OtherDerived& other) const { + cwiseMin(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } // Comparisons and tests. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { + operator<(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { + operator<=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { + operator>(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { + operator>=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { + operator==(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { + operator!=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + // Contractions. + typedef std::pair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + // Coefficient-wise ternary operators. - template + template inline const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const{ + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..d424df36e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +} // end namespace internal + + + +template +class TensorContractionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorContractionOp XprType; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + { + Index index = 0; + Index stride = 1; + m_shiftright = 1; + + int skipped = 0; + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].first == i) { + skip = true; + m_leftOffsets[2*skipped] = stride; + m_leftOffsets[2*skipped+1] = stride * left_dims[i]; + m_stitchsize[skipped] = left_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = left_dims[i]; + m_shiftright *= left_dims[i]; + } else { + ++skipped; + } + stride *= left_dims[i]; + } + + stride = 1; + skipped = 0; + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) 
{ + if (op.indices()[j].second == i) { + skip = true; + m_rightOffsets[2*skipped] = stride; + m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = right_dims[i]; + } else { + ++skipped; + } + stride *= right_dims[i]; + } + + // Scalar case + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + m_dimensions[0] = 1; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + const Index startLeft = index % m_shiftright; + const Index startRight = index / m_shiftright; + CoeffReturnType result = CoeffReturnType(0); + partialStitch(startLeft, startRight, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + private: + EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { + Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); + Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); + + for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { + const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; + const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; + if (StitchIndex < Indices::size-1) { + partialStitch(left, right, StitchIndex+1, accum); + } else { + accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); + } + } + } + + private: + array m_leftOffsets; + array m_rightOffsets; + array m_stitchsize; + Index m_shiftright; + Dimensions m_dimensions; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 71890e187..dbe60a165 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -59,7 +59,7 @@ template class TensorDevice class TensorDevice { public: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index ded6ca604..d7f5ab7c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -37,17 +37,14 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU struct GpuDevice { - // todo: support for multiple gpu; - GpuDevice() { - cudaStreamCreate(&stream_); - } - ~GpuDevice() { - cudaStreamDestroy(stream_); - } - const cudaStream_t& stream() const { return stream_; } + // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. + GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } + + const cudaStream_t& stream() const { return *stream_; } private: - cudaStream_t stream_; + // TODO: multigpu. 
+ const cudaStream_t* stream_; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 43e9d6550..c92b8c679 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -35,14 +35,14 @@ namespace Eigen { namespace internal { template struct dget { - static const std::size_t value = internal::get::value; - }; + static const std::size_t value = get::value; +}; template struct fixed_size_tensor_index_linearization_helper { - template + template EIGEN_DEVICE_FUNC static inline Index run(array const& indices, const Dimensions& dimensions) { @@ -55,7 +55,7 @@ struct fixed_size_tensor_index_linearization_helper template struct fixed_size_tensor_index_linearization_helper { - template + template EIGEN_DEVICE_FUNC static inline Index run(array const& indices, const Dimensions&) { @@ -93,11 +93,11 @@ struct Sizes : internal::numeric_list { return *this; } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } @@ -139,11 +139,11 @@ template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); } @@ -180,13 +180,18 @@ struct tensor_index_linearization_helper template struct DSizes : array { typedef array Base; + static const std::size_t count = NumDims; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { return internal::array_prod(*static_cast(this)); } - DSizes() { } - explicit DSizes(const array& a) : Base(a) { } + EIGEN_DEVICE_FUNC DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } DSizes& operator = (const array& other) { *static_cast(this) = other; @@ -194,10 +199,10 @@ struct DSizes : array { } // A constexpr would be so much better here - size_t IndexOfColMajor(const array& indices) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } - size_t IndexOfRowMajor(const array& indices) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e0c0863b7..ab2513cea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -21,7 +21,6 @@ namespace Eigen { * * TODO: add support for more types of expressions, in particular expressions * leading to lvalues (slicing, reshaping, etc...) 
- * TODO: add support for vectorization */ template @@ -32,16 +31,19 @@ struct TensorEvaluator typedef typename Derived::Packet Packet; typedef typename Derived::Scalar CoeffReturnType; typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, }; - TensorEvaluator(Derived& m) - : m_data(const_cast(m.data())) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_data[index]; } @@ -64,29 +66,34 @@ struct TensorEvaluator protected: Scalar* m_data; + Dimensions m_dims; }; // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator > { - typedef TensorCwiseNullaryOp XprType; + typedef TensorCwiseNullaryOp XprType; enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, }; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_functor(op.functor()) + : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -101,6 +108,7 @@ struct TensorEvaluator > private: const NullaryOp m_functor; + TensorEvaluator m_argImpl; }; @@ -117,7 +125,7 @@ struct TensorEvaluator > PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { } @@ -125,6 +133,9 @@ struct TensorEvaluator > typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -156,7 +167,7 @@ struct TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) @@ -165,6 +176,13 @@ struct TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
+ return m_leftImpl.dimensions(); + } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -196,7 +214,7 @@ struct TensorEvaluator TensorEvaluator::PacketAccess*/, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_condImpl(op.ifExpression()), m_thenImpl(op.thenExpression()), m_elseImpl(op.elseExpression()) @@ -205,7 +223,13 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 94cfae05c..60908ee94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -28,13 +28,13 @@ namespace Eigen { * */ namespace internal { -template -struct traits > - : traits +template +struct traits > + : traits { - typedef typename PlainObjectType::Packet Packet; - typedef typename PlainObjectType::Scalar Scalar; - typedef typename PlainObjectType::Nested XprTypeNested; + typedef typename XprType::Packet Packet; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -42,27 +42,31 @@ struct traits > -template -class TensorCwiseNullaryOp : public TensorBase > +template +class TensorCwiseNullaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; - typedef typename PlainObjectType::PacketReturnType PacketReturnType; - typedef TensorCwiseNullaryOp Nested; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const NullaryOp& func = NullaryOp()) - : m_functor(func) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; } protected: - // todo: add tensor dimension to be able to do some sanity checks + typename XprType::Nested m_xpr; const NullaryOp m_functor; }; @@ -71,7 +75,7 @@ class TensorCwiseNullaryOp : public TensorBase struct traits > - : traits + : traits { typedef typename result_of< UnaryOp(typename XprType::Scalar) @@ -207,7 +211,7 @@ class TensorCwiseBinaryOp : public TensorBase struct traits > - : traits + : traits { typedef typename traits::Scalar Scalar; typedef typename internal::packet_traits::type Packet; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index dcc7ccd65..789c04238 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -52,7 +52,7 @@ class TensorFixedSize : public TensorBase dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 03ac8d516..239b5cb67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,6 +21,8 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; +template class TensorContractionOp; template class TensorDevice; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 64098343e..c9d6517eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -53,7 +53,7 @@ class TensorStorage EIGEN_STRONG_INLINE const T *data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return m_dimensions; } + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } @@ -111,7 +111,8 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - const DSizes& dimensions() const {return m_dimensions;} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const {return m_dimensions;} void conservativeResize(DenseIndex size, const array& nbDimensions) { @@ -132,10 +133,10 @@ class TensorStorage Date: Thu, 5 Jun 2014 10:49:34 -0700 Subject: Created additional tests for the tensor code. 
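The contraction evaluator above flattens each output coordinate into a left and a right linear index (via m_shiftright), then partialStitch() recursively walks every contracted dimension, accumulating products of matching coefficients. The following standalone sketch replays that arithmetic for a single contracted pair: a 2x3 matrix contracted with a 3x2 matrix over its second dimension, column-major, with the offsets hand-derived. The names mirror the evaluator's members for readability, but this is an illustrative sketch, not part of the patch.

#include <cassert>
#include <cstdio>

// Hand-derived for left dims [2,3], right dims [3,2], contracting pair (1,0):
static const int kLeftOffsets[2]  = {2, 6}; // stride at contracted dim, stride * dim extent
static const int kRightOffsets[2] = {1, 3};
static const int kStitchSize      = 3;      // extent of the contracted dimension
static const int kShiftRight      = 2;      // product of surviving left dimensions

// Mirrors partialStitch() for StitchIndex == 0 (one contracted dimension).
float partialStitch(const float* lhs, const float* rhs, int startLeft, int startRight) {
  const int firstLeft  = (startLeft / kLeftOffsets[0]) * kLeftOffsets[1] + (startLeft % kLeftOffsets[0]);
  const int firstRight = (startRight / kRightOffsets[0]) * kRightOffsets[1] + (startRight % kRightOffsets[0]);
  float accum = 0.0f;
  for (int j = 0; j < kStitchSize; ++j) {
    accum += lhs[firstLeft + j * kLeftOffsets[0]] * rhs[firstRight + j * kRightOffsets[0]];
  }
  return accum;
}

int main() {
  const float A[6] = {1, 2, 3, 4, 5, 6};    // 2x3, A(i,j) = A[i + 2*j]
  const float B[6] = {7, 8, 9, 10, 11, 12}; // 3x2, B(i,j) = B[i + 3*j]
  for (int index = 0; index < 4; ++index) { // result is 2x2
    const int startLeft  = index % kShiftRight; // as in coeff(Index index)
    const int startRight = index / kShiftRight;
    const float got = partialStitch(A, B, startLeft, startRight);
    // Reference: plain matrix product result(i,j) = sum_k A(i,k)*B(k,j).
    float expected = 0.0f;
    for (int j = 0; j < 3; ++j) expected += A[startLeft + 2*j] * B[j + 3*startRight];
    assert(got == expected);
    std::printf("result(%d,%d) = %g\n", startLeft, startRight, got);
  }
  return 0;
}

The stitching thus reproduces an ordinary matrix product for this case; with more contracted pairs the recursion simply nests one such loop per pair.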
--- unsupported/test/CMakeLists.txt | 2 + unsupported/test/cxx11_tensor_comparisons.cpp | 84 +++++++++++++ unsupported/test/cxx11_tensor_contraction.cpp | 163 ++++++++++++++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 17 ++- unsupported/test/cxx11_tensor_expr.cpp | 149 ++++++++++++++++++++--- unsupported/test/cxx11_tensor_fixed_size.cpp | 14 +-- unsupported/test/cxx11_tensor_thread_pool.cpp | 7 +- 7 files changed, 406 insertions(+), 30 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_comparisons.cpp create mode 100644 unsupported/test/cxx11_tensor_contraction.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index abc3375e5..d6072c9f3 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,6 +102,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_comparison "-std=c++0x") + ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_device "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp new file mode 100644 index 000000000..186f56ac3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -0,0 +1,84 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_orderings() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor lt(2,3,7); + Tensor le(2,3,7); + Tensor gt(2,3,7); + Tensor ge(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + + lt = mat1 < mat2; + le = mat1 <= mat2; + gt = mat1 > mat2; + ge = mat1 >= mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k)); + VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k)); + VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k)); + VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k)); + } + } + } +} + + +static void test_equality() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + if (random() < 0.5) { + mat2(i,j,k) = mat1(i,j,k); + } + } + } + } + + Tensor eq(2,3,7); + Tensor ne(2,3,7); + eq = (mat1 == mat2); + ne = (mat1 != mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k)); + VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_comparisons() +{ + CALL_SUBTEST(test_orderings()); + CALL_SUBTEST(test_equality()); +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp new file mode 100644 index 000000000..1c89dfdd1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -0,0 +1,163 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +typedef Tensor::DimensionPair DimPair; + + +static void test_evals() +{ + Tensor mat1(2, 3); + Tensor mat2(2, 3); + Tensor mat3(3, 2); + + mat1.setRandom(); + mat2.setRandom(); + mat3.setRandom(); + + Tensor mat4(3,3); + mat4.setZero(); + Eigen::array dims3({{DimPair(0, 0)}}); + TensorEvaluator eval(mat1.contract(mat2, dims3)); + eval.evalTo(mat4.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 3); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2)); + + Tensor mat5(2,2); + mat5.setZero(); + Eigen::array dims4({{DimPair(1, 1)}}); + TensorEvaluator eval2(mat1.contract(mat2, dims4)); + eval2.evalTo(mat5.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2)); + VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2)); + + Tensor mat6(2,2); + mat6.setZero(); + Eigen::array dims6({{DimPair(1, 0)}}); + TensorEvaluator eval3(mat1.contract(mat3, dims6)); + eval3.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1)); + VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1)); +} + + +static void test_scalar() +{ + Tensor vec1({6}); + Tensor vec2({6}); + + vec1.setRandom(); + vec2.setRandom(); + + Tensor scalar(1); + scalar.setZero(); + Eigen::array dims({{DimPair(0, 0)}}); + TensorEvaluator eval(vec1.contract(vec2, dims)); + eval.evalTo(scalar.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + + float expected = 0.0f; + for (int i = 0; i < 6; ++i) { + expected += vec1(i) * vec2(i); + } + VERIFY_IS_APPROX(scalar(0), expected); +} + + +static void test_multidims() +{ + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2, 
2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2, 2, 2); + mat3.setZero(); + Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); + TensorEvaluator eval(mat1.contract(mat2, dims)); + eval.evalTo(mat3.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 2); + VERIFY_IS_EQUAL(eval.dimensions()[2], 2); + + VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) + + mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) + + mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) + + mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) + + mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1)); + VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) + + mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) + + mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) + + mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); +} + + +static void test_expr() +{ + Tensor mat1(2, 3); + Tensor mat2(3, 2); + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2,2); + + Eigen::array dims({{DimPair(1, 0)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1)); + VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); +} + + +void test_cxx11_tensor_contraction() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_scalar()); + CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_expr()); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index 9eb1d0420..365b109c7 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -15,7 +15,7 @@ #include "main.h" -#include +#include using Eigen::Tensor; using Eigen::RowMajor; @@ -39,8 +39,12 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { - GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out) { } - + GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + cudaStreamCreate(&stream_); + } + ~GPUContext() { + cudaStreamDestroy(stream_); + } const Eigen::TensorMap >& in1() const { return in1_; } const Eigen::TensorMap >& in2() const { return in2_; } Eigen::TensorDevice >, Eigen::GpuDevice> out() { return TensorDevice >, Eigen::GpuDevice>(gpu_device_, out_); } @@ -49,6 +53,7 @@ struct GPUContext { const Eigen::TensorMap >& in1_; const Eigen::TensorMap >& in2_; Eigen::TensorMap >& out_; + cudaStream_t stream_; 
Eigen::GpuDevice gpu_device_; }; @@ -57,7 +62,7 @@ struct GPUContext { template static void test_contextual_eval(Context* context) { - context->out() = context->in1() + context->in2() * 3.14f; + context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } static void test_cpu() { @@ -73,7 +78,7 @@ static void test_cpu() { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } @@ -111,7 +116,7 @@ static void test_gpu() { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index e0124da8c..e85fcbfa9 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -28,10 +28,10 @@ static void test_1d() float data3[6]; TensorMap> vec3(data3, 6); - vec3 = vec1.cwiseSqrt(); + vec3 = vec1.sqrt(); float data4[6]; TensorMap> vec4(data4, 6); - vec4 = vec2.cwiseSqrt(); + vec4 = vec2.square(); VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); @@ -40,12 +40,12 @@ static void test_1d() VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); - VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); - VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); - VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); - VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); - VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); - VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + VERIFY_IS_APPROX(vec4(0), 0.0f); + VERIFY_IS_APPROX(vec4(1), 1.0f); + VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f); + VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f); + VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f); + VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f); vec3 = vec1 + vec2; VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); @@ -79,8 +79,8 @@ static void test_2d() Tensor mat3(2,3); Tensor mat4(2,3); - mat3 = mat1.cwiseAbs(); - mat4 = mat2.cwiseAbs(); + mat3 = mat1.abs(); + mat4 = mat2.abs(); VERIFY_IS_APPROX(mat3(0,0), 0.0f); VERIFY_IS_APPROX(mat3(0,1), 1.0f); @@ -102,7 +102,7 @@ static void test_3d() Tensor mat1(2,3,7); Tensor mat2(2,3,7); - float val = 0.0; + float val = 1.0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -118,28 +118,147 @@ static void test_3d() Tensor mat4(2,3,7); mat4 = mat2 * 3.14f; Tensor mat5(2,3,7); - mat5 = mat1.cwiseSqrt().cwiseSqrt(); + mat5 = mat1.inverse().log(); Tensor mat6(2,3,7); - mat6 = mat2.cwiseSqrt() * 3.14f; + mat6 = mat2.pow(0.5f) * 3.14f; + Tensor mat7(2,3,7); + mat7 = mat1.cwiseMax(mat5 * 2.0f).exp(); + Tensor mat8(2,3,7); + mat8 = (-mat2).exp() * 3.14f; - val = 0.0; + val = 1.0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), val + val); VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); - VERIFY_IS_APPROX(mat5(i,j,k), sqrtf(sqrtf(val))); + VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val)); VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); + VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f); val += 
1.0; } } } } +static void test_constants() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.constant(3.14f); + mat3 = mat1.cwiseMax(7.3f).exp(); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); + VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); + val += 1.0; + } + } + } +} + + +static void test_functors() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.inverse().unaryExpr(&asinf); + mat3 = mat1.unaryExpr(&tanhf); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); + VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); + val += 1.0; + } + } + } +} + +static void test_type_casting() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + mat1.setRandom(); + mat2.setRandom(); + + mat3 = mat1.template cast(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0); + } + } + } + + mat3 = mat2.template cast(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), static_cast(mat2(i,j,k))); + } + } + } +} + +static void test_select() +{ + Tensor selector(2,3,7); + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor result(2,3,7); + + selector.setRandom(); + mat1.setRandom(); + mat2.setRandom(); + result = (selector > selector.constant(0.5f)).select(mat1, mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? 
mat1(i,j,k) : mat2(i,j,k)); + } + } + } +} + void test_cxx11_tensor_expr() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_constants()); + CALL_SUBTEST(test_functors()); + CALL_SUBTEST(test_type_casting()); + CALL_SUBTEST(test_select()); } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 214f6951d..d270486f2 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -33,10 +33,10 @@ static void test_1d() float data3[6]; TensorMap > > vec3(data3, 6); - vec3 = vec1.cwiseSqrt(); + vec3 = vec1.sqrt(); float data4[6]; TensorMap, RowMajor> > vec4(data4, 6); - vec4 = vec2.cwiseSqrt(); + vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); @@ -92,8 +92,8 @@ static void test_2d() TensorFixedSize> mat3; TensorFixedSize, RowMajor> mat4; - mat3 = mat1.cwiseAbs(); - mat4 = mat2.cwiseAbs(); + mat3 = mat1.abs(); + mat4 = mat2.abs(); VERIFY_IS_EQUAL((mat3.size()), 2*3); // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); @@ -136,9 +136,9 @@ static void test_3d() } TensorFixedSize > mat3; - mat3 = mat1.cwiseSqrt(); + mat3 = mat1.sqrt(); TensorFixedSize, RowMajor> mat4; - mat4 = mat2.cwiseSqrt(); + mat4 = mat2.sqrt(); VERIFY_IS_EQUAL((mat3.size()), 2*3*7); // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); @@ -173,7 +173,7 @@ static void test_array() } TensorFixedSize > mat3; - mat3 = mat1.cwisePow(3.5f); + mat3 = mat1.pow(3.5f); val = 0.0; for (int i = 0; i < 2; ++i) { diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index c9de71da3..b371e8a71 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -12,6 +12,7 @@ #include "main.h" #include +#include "thread/threadpool.h" using Eigen::Tensor; @@ -24,8 +25,10 @@ void test_cxx11_tensor_thread_pool() in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); - out.device(thread_pool_device) = in1 + in2 * 3.14; + ThreadPool thread_pool(2); + thread_pool.StartWorkers(); + Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3); + out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { -- cgit v1.2.3 From a961d72e65fc537fe571845407b4e2ee0554bd49 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 16:25:16 -0700 Subject: Added support for convolution and reshaping of tensors. 
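The commit below introduces convolve() and reshape() on TensorBase. As a usage sketch (assuming the module header path used by the tests in this series): a "valid" convolution shrinks each convolved dimension to input_dim - kernel_dim + 1, which is exactly what the evaluator below stores into m_dimensions.

#include <Eigen/CXX11/Tensor> // path assumed from the test files in this series

void convolve_rows_example() {
  Eigen::Tensor<float, 2> input(5, 4);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along dimension 0 only; mirrors the style of test_expr below.
  Eigen::array<Eigen::Tensor<float, 2>::Index, 1> dims({0});
  Eigen::Tensor<float, 2> result(3, 4); // 5 - 3 + 1 = 3 along the convolved dim
  result = input.convolve(kernel, dims);
}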
--- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++ .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 206 +++++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 119 ++++++++++++ unsupported/test/cxx11_tensor_convolution.cpp | 70 +++++++ 6 files changed, 413 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h create mode 100644 unsupported/test/cxx11_tensor_convolution.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d4e8d3a15..c67020581 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -40,6 +40,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index c5c711313..932e5c82d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -203,6 +203,13 @@ class TensorBase return TensorContractionOp(derived(), other.derived(), dims); } + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + // Coefficient-wise ternary operators. template inline const TensorSelectOp @@ -210,6 +217,13 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + // Morphing operators (slicing tbd). + template + inline const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + // Select the device on which to evaluate the expression. template TensorDevice device(const DeviceType& device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 000000000..ca2e0e562 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,206 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = TensorEvaluator::Dimensions::count; + static const int KernelDims = Indices::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + { + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + } else { + m_inputStride[0] = 1; + } + } + + for (int i = 0; i < KernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + } else { + m_outputStride[0] = 1; + } + } + } + + 
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index startInput = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + + CoeffReturnType result = CoeffReturnType(0); + convolve(startInput, 0, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex < KernelDims-1) { + convolve(input, kernel, DimIndex+1, accum); + } else { + + accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel); + } + } + } + + private: + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + Dimensions m_dimensions; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 239b5cb67..b8833362c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,9 +21,9 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorReductionOp; template class TensorContractionOp; - +template class TensorConvolutionOp; +template class TensorReshapingOp; template class TensorDevice; // Move to internal? diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 000000000..3e089fe1e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReshapingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReshapingOp type; +}; + +} // end namespace internal + + + +template +class TensorReshapingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorReshapingOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + TensorEvaluator(const XprType& op) + : m_impl(op.expression()), m_dimensions(op.dimensions()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const NewDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + private: + NewDimensions m_dimensions; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp new file mode 100644 index 000000000..95e40f64f --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_evals() +{ + Tensor input(3, 3); + Tensor kernel(2); + + input.setRandom(); + kernel.setRandom(); + + Tensor result(2,3); + result.setZero(); + Eigen::array::Index, 1> dims3({0}); + + TensorEvaluator eval(input.convolve(kernel, dims3)); + eval.evalTo(result.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 +} + + +static void test_expr() +{ + Tensor input(3, 3); + Tensor kernel(2, 2); + input.setRandom(); + kernel.setRandom(); + + Tensor result(2,2); + Eigen::array dims({0, 1}); + result = input.convolve(kernel, dims); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); +} + + +void test_cxx11_tensor_convolution() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_expr()); +} -- cgit v1.2.3 From 79085e08e9512f678b4584df49d1b2835b40117f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:16:13 -0700 Subject: Fixed a typo --- unsupported/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6072c9f3..e67e61263 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,7 +102,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") - ei_add_test(cxx11_tensor_comparison "-std=c++0x") + ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") -- cgit v1.2.3 From 29aebf96e62f4fb5e4b1f3fb475e299df2e7a02e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:18:44 -0700 Subject: Created the pblend packet primitive and implemented it using SSE and AVX instructions. 
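A scalar model of the pblend() primitive added below: lane i of the result takes thenPacket[i] where the selector is true and elsePacket[i] otherwise. The generic fallback in GenericPacketMath.h consults only select[0], because the generic "packet" holds a single scalar; the SSE and AVX specializations build a comparison mask from the selector and blend with blendv-style instructions. This sketch shows the lane-wise semantics those specializations implement; it is a model, not the vectorized code.

#include <cstdio>

template <int N>
struct Selector { bool select[N]; };

// Lane-wise blend: semantically what pblend<Packet4f> computes with _mm_blendv_ps.
template <int N>
void scalar_pblend(const Selector<N>& ifPacket, const float* thenPacket,
                   const float* elsePacket, float* result) {
  for (int i = 0; i < N; ++i) {
    result[i] = ifPacket.select[i] ? thenPacket[i] : elsePacket[i];
  }
}

int main() {
  const float a[4] = {1, 2, 3, 4};
  const float b[4] = {10, 20, 30, 40};
  Selector<4> sel = {{true, false, true, false}};
  float r[4];
  scalar_pblend<4>(sel, a, b, r);
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 1 20 3 40
  return 0;
}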
--- Eigen/src/Core/GenericPacketMath.h | 14 ++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 15 +++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 8 ++++++- Eigen/src/Core/arch/SSE/PacketMath.h | 41 +++++++++++++++++++++++++++++++++--- test/packetmath.cpp | 16 ++++++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..0869dd49f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -429,6 +430,19 @@ ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template struct Selector { + bool select[N]; +}; + +template EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8b8307d75..688ff91e4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -59,6 +59,7 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 0, HasSqrt = 0 + HasBlend = 1, }; }; template<> struct packet_traits : default_packet_traits @@ -73,6 +74,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 0 + HasBlend = 1, }; }; @@ -557,6 +559,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 758183c18..0bc03cf9e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + 
__m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6912f3bc3..1124b24df 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -123,7 +124,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -809,6 +813,37 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..663ab886d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits::HasBlend) { + Packet thenPacket = internal::pload(data1); + Packet elsePacket = internal::pload(data2); + EIGEN_ALIGN_DEFAULT internal::Selector selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? 
data1[i] : data2[i]), refvalue)); + } + } } template void packetmath_real() -- cgit v1.2.3 From 8c8ae2d8193809744f5952713287639817e2b442 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 7 Jun 2014 11:24:38 -0700 Subject: Fixed a typo --- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 688ff91e4..74d3746d9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -58,8 +58,8 @@ template<> struct packet_traits : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 - HasBlend = 1, + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -73,8 +73,8 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 - HasBlend = 1, + HasExp = 0, + HasBlend = 1 }; }; -- cgit v1.2.3 From fe102248ac8f78e33064caeb5cdea6fc41af637c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:19:21 -0700 Subject: Fixed the threadpool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index b371e8a71..2e67b2064 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -12,7 +12,6 @@ #include "main.h" #include -#include "thread/threadpool.h" using Eigen::Tensor; @@ -25,9 +24,7 @@ void test_cxx11_tensor_thread_pool() in1.setRandom(); in2.setRandom(); - ThreadPool thread_pool(2); - thread_pool.StartWorkers(); - Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3); + Eigen::ThreadPoolDevice thread_pool_device(3); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { -- cgit v1.2.3 From 2859a31ac80af86fa58e5347be50d32fd07bcd3c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:42:34 -0700 Subject: Fixed compilation error --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a2ff5b30..3a06170fa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -54,18 +54,18 @@ template class TensorMap : public Tensor PacketAccess = true, }; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif inline TensorMap(PointerArgType dataPtr, const array& dimensions) -- cgit v1.2.3 From 36a2b2e9dc9368356b3f327a1fb00616397c1e0e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:43:51 -0700 Subject: Prevent the generation of unlaunchable cuda kernels when compiling in debug mode. --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index c92b8c679..3e5687915 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -73,7 +73,7 @@ struct Sizes : internal::numeric_list { typedef internal::numeric_list Base; static const std::size_t total_size = internal::arg_prod(Indices...); - static std::size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { return internal::arg_prod(Indices...); } @@ -119,7 +119,7 @@ template ::value; - static size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { return internal::arg_prod::value; } @@ -156,7 +156,8 @@ namespace internal { template struct tensor_index_linearization_helper { - static inline Index run(array const& indices, array const& dimensions) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const& dimensions) { return array_get(indices) + array_get(dimensions) * @@ -167,7 +168,8 @@ struct tensor_index_linearization_helper template struct tensor_index_linearization_helper { - static inline Index run(array const& indices, array const&) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const&) { return array_get(indices); } -- cgit v1.2.3 From a669052f12d6d71ba815764d6419726d64fef675 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:45:30 -0700 Subject: Improved support for rvalues in tensor expressions. 
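The change splits TensorBase into two layers selected by an access-level
template parameter: read-only accessors shared by every expression, and a
derived writable layer that alone exposes the mutating helpers (setZero,
setConstant, setRandom, operator+=, operator-=). A minimal sketch of the
pattern follows; the names and the abridged operator set are illustrative,
not the actual Eigen declarations:

    // Sketch only: two-layer CRTP base with an access-level tag.
    enum AccessLevel { ReadOnly = 0, Writable = 1 };

    template <typename Derived, AccessLevel Level> class Base;

    // Layer 1: const accessors available on all tensor expressions.
    template <typename Derived>
    class Base<Derived, ReadOnly> {
     public:
      const Derived& derived() const { return *static_cast<const Derived*>(this); }
      // read-only factories: constant(), random(), cwise operators, ...
    };

    // Layer 2: writable expressions additionally get mutating helpers.
    template <typename Derived>
    class Base<Derived, Writable> : public Base<Derived, ReadOnly> {
     public:
      Derived& derived() { return *static_cast<Derived*>(this); }
      Derived& setZero();  // e.g. implemented as derived() = this->constant(0)
    };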
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 58 ++++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++ .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 4 ++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 8 +++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 6 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6 ++- 7 files changed, 71 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 932e5c82d..e447a5d40 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -22,7 +22,7 @@ namespace Eigen { */ template -class TensorBase +class TensorBase { public: typedef typename internal::traits::Scalar Scalar; @@ -30,19 +30,6 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = random(); - } - // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -224,14 +211,53 @@ class TensorBase return TensorReshapingOp(derived(), newDimensions); } + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + + +template +class TensorBase : public TensorBase { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + // Select the device on which to evaluate the expression. 
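// (Usage sketch, mirroring the thread pool test earlier in this series; the
//  device object and its thread count are illustrative:
//    Eigen::ThreadPoolDevice my_device(4);
//    out.device(my_device) = in1 + in2 * 3.14f;
//  The assignment then runs through the chosen device instead of the default
//  single-threaded evaluation path.)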
template TensorDevice device(const DeviceType& device) { return TensorDevice(device, derived()); } - protected: - template friend class TensorBase; + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d424df36e..d371eb76d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -35,6 +35,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index ca2e0e562..501e9a522 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -35,6 +35,10 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 60908ee94..de66da13f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -36,6 +36,10 @@ struct traits > typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + + enum { + Flags = 0, + }; }; } // end namespace internal @@ -153,6 +157,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8833362c..1fb90478f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,7 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; -template class TensorBase; +template::value> class TensorBase; template class TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; @@ -29,6 +29,10 @@ template class TensorDevice; // Move to internal? template struct TensorEvaluator; +namespace internal { +template struct TensorAssign; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3e089fe1e..7d5f9271e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { template -struct traits > +struct traits > : public traits { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
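// (Deriving from traits<XprType> also forwards the expression's Flags, so a
//  reshaped lvalue keeps the LvalueBit that this patch adds to the tensor
//  traits further down, and can therefore stay on the left-hand side of an
//  assignment.)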
typedef typename XprType::Scalar Scalar; @@ -81,6 +81,7 @@ template struct TensorEvaluator > { typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; enum { IsAligned = TensorEvaluator::IsAligned, @@ -95,7 +96,7 @@ struct TensorEvaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const NewDimensions& dimensions() const { return m_dimensions; } + const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 2de698a57..40f805741 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -52,7 +52,7 @@ struct traits > typedef DenseIndex Index; enum { Options = Options_, - Flags = compute_tensor_flags::ret, + Flags = compute_tensor_flags::ret | LvalueBit, }; }; @@ -63,6 +63,10 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | LvalueBit, + }; }; -- cgit v1.2.3 From a77458a8ff2a83e716add62253eb50ef64980b21 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 10:06:57 -0700 Subject: Fixes compilation errors triggered when compiling the tensor contraction code with cxx11 enabled. --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 6 ++++++ .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 17 +++++++++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 16 ++++++++-------- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index f102872ae..423ca4be4 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,12 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 636063f9e..1d3164d6a 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -182,23 +182,32 @@ array repeat(t v) { } template -t array_prod(const array& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; for (size_t i = 0; i < n; ++i) { prod *= a[i]; } return prod; } template -t array_prod(const array& /*a*/) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } -template inline T& array_get(array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template inline const T& array_get(const array& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const T& array_get(const array& a) { return a[I]; } + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + struct sum_op { template static inline bool run(A a, B b) { return a + b; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d371eb76d..5149de1bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -107,7 +107,7 @@ struct TensorEvaluator XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -128,7 +128,7 @@ struct TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { skip = true; m_leftOffsets[2*skipped] = stride; @@ -151,7 +151,7 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { skip = true; m_rightOffsets[2*skipped] = stride; @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -209,7 +209,7 @@ struct TensorEvaluator::value-1) { partialStitch(left, right, StitchIndex+1, accum); } else { accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); @@ -218,9 +218,9 @@ struct TensorEvaluator m_leftOffsets; - array m_rightOffsets; - array m_stitchsize; + array::value> m_leftOffsets; + array::value> m_rightOffsets; + array::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; TensorEvaluator m_leftImpl; -- cgit v1.2.3 From 925fb6b93710b95082ba44d30405289dff3707eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 09:14:44 -0700 Subject: TensorEval are now typed on the device: this will make it possible to use partial template specialization to optimize the strategy of each evaluator for each device type. Started work on partial evaluations. --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 42 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++-- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 26 ++++---- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 20 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 28 ++++++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++++++----------- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 9 +-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++-- 9 files changed, 129 insertions(+), 102 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index da1eb62cb..633a7a31b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -32,15 +32,15 @@ namespace Eigen { namespace internal { // Default strategy: the expressions are evaluated with a single cpu thread. 
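// (Both strategies follow the same pattern: build one evaluator per side,
//  then walk the flattened index space. A simplified sketch with hypothetical
//  type names:
//    TensorEvaluator<Dst, Device> eval_dst(dst, device);
//    TensorEvaluator<Src, Device> eval_src(src, device);
//    for (Index i = 0; i < dst.size(); ++i)
//      eval_dst.coeffRef(i) = eval_src.coeff(i);
//  The vectorized specialization further down does the same copy a whole
//  packet at a time.)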
-template::PacketAccess & TensorEvaluator::PacketAccess> +template::PacketAccess & TensorEvaluator::PacketAccess> struct TensorAssign { typedef typename Derived1::Index Index; EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); for (Index i = 0; i < size; ++i) { evalDst.coeffRef(i) = evalSrc.coeff(i); @@ -49,19 +49,19 @@ struct TensorAssign }; -template -struct TensorAssign +template +struct TensorAssign { typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { @@ -116,12 +116,12 @@ struct TensorAssignMultiThreaded typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, DefaultDevice()); + TensorEvaluator evalSrc(src, Defaultevice()); const Index size = dst.size(); - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; + static const int PacketSize = Vectorizable ? 
unpacket_traits::PacketReturnType>::size : 1; int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); @@ -131,7 +131,7 @@ struct TensorAssignMultiThreaded vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -167,19 +167,19 @@ struct TensorAssignGpu typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); const int block_size = std::min(size, 32*32); const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); const int remaining_items = size % block_size; if (remaining_items > 0) { const int peel_start_offset = num_blocks * block_size; const int peel_block_size = std::min(size, 32); const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index e447a5d40..6b53d2a3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -198,19 +198,25 @@ class TensorBase } // Coefficient-wise ternary operators. - template - inline const TensorSelectOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Morphing operators (slicing tbd). - template - inline const TensorReshapingOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } + // Force the evaluation of the expression. 
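// (eval() wraps the expression in a TensorForcedEvalOp, which materializes
//  the intermediate result into a buffer so that subsequent coefficient
//  accesses read memory instead of recomputing the subexpression. A purely
//  illustrative use, with hypothetical tensors a, b and c:
//    c = (a + b).eval() * a;   // a + b is evaluated once into a buffer
//  )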
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 5149de1bb..cadbabda2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,31 +102,31 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorContractionOp XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { Index index = 0; Index stride = 1; m_shiftright = 1; int skipped = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { @@ -148,8 +148,8 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -223,8 +223,8 @@ struct TensorEvaluator::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 501e9a522..a554b8260 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -94,27 +94,27 @@ class TensorConvolutionOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorConvolutionOp XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; + static const int NumDims = TensorEvaluator::Dimensions::count; static const int KernelDims = Indices::size; typedef typename XprType::Index Index; 
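// (The convolution keeps the rank of its input: only the dimensions listed
//  in Indices are shrunk, for this valid-style convolution to roughly
//  input_size - kernel_size + 1 each, and the kernel's rank merely has to
//  match KernelDims.)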
typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions()) { - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); for (int i = 0; i < NumDims; ++i) { if (i > 0) { @@ -200,8 +200,8 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; Dimensions m_dimensions; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index dbe60a165..ce524a818 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -31,7 +31,7 @@ template class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other); + internal::TensorAssign::run(m_expression, other, m_device); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index d7f5ab7c9..142edda14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -15,6 +15,12 @@ namespace Eigen { // Default device for the machine (typically a single cpu core) struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } }; @@ -22,14 +28,19 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } - /*ThreadPool* threadPool() const { return pool_; }*/ + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } private: // todo: NUMA, ... size_t num_threads_; - /*ThreadPool* pool_;*/ }; #endif @@ -40,7 +51,16 @@ struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
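// (Host-side usage sketch; the stream handling is illustrative and assumes
//  in1, in2 and out are tensor maps over device memory:
//    cudaStream_t stream;
//    cudaStreamCreate(&stream);
//    Eigen::GpuDevice gpu_device(&stream);
//    out.device(gpu_device) = in1 + in2;  // kernel launched on 'stream'
//    cudaStreamSynchronize(stream);
//    cudaStreamDestroy(stream);
//  )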
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } - const cudaStream_t& stream() const { return *stream_; } + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + void* result; + cudaMalloc(&result, num_bytes); + return result; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + cudaFree(buffer); + } private: // TODO: multigpu. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ab2513cea..80fe06957 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,7 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) */ -template +template struct TensorEvaluator { typedef typename Derived::Index Index; @@ -38,7 +38,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -73,8 +73,8 @@ struct TensorEvaluator // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; @@ -84,14 +84,14 @@ struct TensorEvaluator > }; EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op) - : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -108,32 +108,32 @@ struct TensorEvaluator > private: const NullaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseUnaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -150,33 +150,33 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseBinaryOp -------------------- -template -struct TensorEvaluator > +template +struct 
TensorEvaluator, Device> { typedef TensorCwiseBinaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_leftImpl(op.lhsExpression()), - m_rightImpl(op.rhsExpression()) + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -196,34 +196,34 @@ struct TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; // -------------------- SelectOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorSelectOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_condImpl(op.ifExpression()), - m_thenImpl(op.thenExpression()), - m_elseImpl(op.elseExpression()) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -248,9 +248,9 @@ struct TensorEvaluator } private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 1fb90478f..27bfe1d73 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,16 +21,17 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; -template class TensorDevice; +template class TensorForcedEvalOp; -// Move to internal? 
-template struct TensorEvaluator; +template class TensorDevice; +template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template struct TensorAssign; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7d5f9271e..e9e74581f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -77,19 +77,19 @@ class TensorReshapingOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) - : m_impl(op.expression()), m_dimensions(op.dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } typedef typename XprType::Index Index; @@ -111,7 +111,7 @@ struct TensorEvaluator > private: NewDimensions m_dimensions; - TensorEvaluator m_impl; + TensorEvaluator m_impl; }; -- cgit v1.2.3 From aa664eabb912a1b96e417e9a8d9c98f423b7fc23 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 10:31:29 -0700 Subject: Fixed a few compilation errors. --- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/test/CMakeLists.txt | 9 ++++---- unsupported/test/cxx11_tensor_contraction.cpp | 26 +++++++++++++--------- unsupported/test/cxx11_tensor_convolution.cpp | 7 +++--- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a554b8260..c4cfe0cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ struct TensorEvaluator XprType; static const int NumDims = TensorEvaluator::Dimensions::count; - static const int KernelDims = Indices::size; + static const int KernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 4a151bfa7..34130a192 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -95,9 +95,8 @@ ei_add_test(bdcsvd) option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." ON) if(EIGEN_TEST_CXX11) - # FIXME: add C++11 compiler switch in some portable way - # (MSVC doesn't need any for example, so this will - # clash there) + # It should be safe to always run these tests as there is some fallback code for + # older compiler that don't support cxx11. 
ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") @@ -107,7 +106,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") - ei_add_test(cxx11_tensor_device "-std=c++0x") +# ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") +# ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 1c89dfdd1..fc67d500b 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -11,6 +11,7 @@ #include +using Eigen::DefaultDevice; using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; @@ -29,9 +30,10 @@ static void test_evals() Tensor mat4(3,3); mat4.setZero(); Eigen::array dims3({{DimPair(0, 0)}}); - TensorEvaluator eval(mat1.contract(mat2, dims3)); + typedef TensorEvaluator Evaluator; + Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); eval.evalTo(mat4.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 3); VERIFY_IS_EQUAL(eval.dimensions()[1], 3); @@ -48,9 +50,10 @@ static void test_evals() Tensor mat5(2,2); mat5.setZero(); Eigen::array dims4({{DimPair(1, 1)}}); - TensorEvaluator eval2(mat1.contract(mat2, dims4)); + typedef TensorEvaluator Evaluator2; + Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); eval2.evalTo(mat5.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); @@ -62,9 +65,10 @@ static void test_evals() Tensor mat6(2,2); mat6.setZero(); Eigen::array dims6({{DimPair(1, 0)}}); - TensorEvaluator eval3(mat1.contract(mat3, dims6)); + typedef TensorEvaluator Evaluator3; + Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); eval3.evalTo(mat6.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); @@ -86,9 +90,10 @@ static void test_scalar() Tensor scalar(1); scalar.setZero(); Eigen::array dims({{DimPair(0, 0)}}); - TensorEvaluator eval(vec1.contract(vec2, dims)); + typedef TensorEvaluator Evaluator; + Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); eval.evalTo(scalar.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); float expected = 0.0f; for (int i = 0; i < 6; ++i) { @@ -109,9 +114,10 @@ static void test_multidims() Tensor mat3(2, 2, 2); mat3.setZero(); Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); - TensorEvaluator eval(mat1.contract(mat2, dims)); + typedef TensorEvaluator Evaluator; + Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); eval.evalTo(mat3.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, 
YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 2); VERIFY_IS_EQUAL(eval.dimensions()[1], 2); VERIFY_IS_EQUAL(eval.dimensions()[2], 2); diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index 95e40f64f..bafe73edd 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -12,7 +12,7 @@ #include using Eigen::Tensor; - +using Eigen::DefaultDevice; static void test_evals() { @@ -26,9 +26,10 @@ static void test_evals() result.setZero(); Eigen::array::Index, 1> dims3({0}); - TensorEvaluator eval(input.convolve(kernel, dims3)); + typedef TensorEvaluator Evaluator; + Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); eval.evalTo(result.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 2); VERIFY_IS_EQUAL(eval.dimensions()[1], 3); -- cgit v1.2.3 From 38ab7e6ed0491bd5a0c639f218d5ea4728bf1e81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 09:56:51 -0700 Subject: Reworked the expression evaluation mechanism in order to make it possible to efficiently compute convolutions and contractions in the future: * The scheduling of computation is moved out the the assignment code and into a new TensorExecutor class * The assignment itself is now a regular node on the expression tree * The expression evaluators start by recursively evaluating all their subexpressions if needed --- unsupported/Eigen/CXX11/Tensor | 4 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 250 +++++++++------------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 8 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 9 + unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 146 ++++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 56 ++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 194 ++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 142 ++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 7 + 14 files changed, 685 insertions(+), 164 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c67020581..7e504b302 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,8 +42,12 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 7f614bbe8..09601fc7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -236,7 +236,9 @@ class Tensor : public TensorBase > // FIXME: we need to resize the tensor to fix the dimensions of the other. // Unfortunately this isn't possible yet when the rhs is an expression. // resize(other.dimensions()); - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 633a7a31b..a2a925775 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorAssign @@ -21,172 +17,134 @@ namespace Eigen { * * \brief The tensor assignment class. * - * This class is responsible for triggering the evaluation of the expressions - * used on the lhs and rhs of an assignment operator and copy the result of - * the evaluation of the rhs expression at the address computed during the - * evaluation lhs expression. - * - * TODO: vectorization. For now the code only uses scalars - * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. */ namespace internal { - -// Default strategy: the expressions are evaluated with a single cpu thread. -template::PacketAccess & TensorEvaluator::PacketAccess> -struct TensorAssign +template +struct traits > { - typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - for (Index i = 0; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; -template -struct TensorAssign +template +struct nested, 1, typename eval >::type> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evalDst.template writePacket(i, evalSrc.template packet(i)); - } - for (Index i = VectorizedSize; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef TensorAssignOp type; }; +} // end namespace internal -// Multicore strategy: the index space is partitioned and each core is assigned to a partition -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - for (Index i = first; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } -}; -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - - Index i = first; - static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { - static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = RhsEval::IsAligned ? Aligned : Unaligned; - eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { - dst.template writePacket(i, src.template packet(i)); - } - } - - for (; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; }; -template -struct TensorAssignMultiThreaded + +template +struct TensorEvaluator, Device> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) + typedef TensorAssignOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { - TensorEvaluator evalDst(dst, DefaultDevice()); - TensorEvaluator 
evalSrc(src, Defaultevice()); - const Index size = dst.size(); - - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; - - int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); - } - - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } - - if (numblocks * blocksize < size) { - EvalRange, TensorEvaluator, Index>::run(evalDst, evalSrc, numblocks * blocksize, size); - } + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); } -}; -#endif - -// GPU: the evaluation of the expressions is offloaded to a GPU. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - evalDst.coeffRef(index) = evalSrc.coeff(index); -} -template -__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - evalDst.coeffRef(index) = evalSrc.coeff(index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_leftImpl.evalSubExprsIfNeeded(); + m_rightImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); } -} -template -struct TensorAssignGpu -{ - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); - } + return m_leftImpl.coeff(index); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; -#endif -} // end namespace internal +} -} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cadbabda2..b2e12fd15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,6 +184,14 @@ struct TensorEvaluator class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -48,7 +51,10 @@ template class TensorDevice EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -67,13 +73,15 @@ template class TensorDevice template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignGpu::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType& m_expression; + ExpressionType m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..db716a80e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
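 * (Note: despite the \brief line above, TensorEvalToOp is not a reshaping
 * operation; it evaluates the wrapped expression and writes the result into
 * a caller-provided buffer, as the evaluator below shows.)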
+ * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template +class TensorEvalToOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + Scalar* m_buffer; +}; + + + +template +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 80fe06957..5c8b079da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -38,27 +38,32 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); return m_data[index]; } - template + template EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template + template EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -95,13 +100,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -137,13 +145,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_argImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -184,12 +199,21 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -230,12 +254,24 @@ struct TensorEvaluator // TODO: use then or 
else impl instead if they happen to be known at compile time.
     return m_condImpl.dimensions();
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
+    m_condImpl.evalSubExprsIfNeeded();
+    m_thenImpl.evalSubExprsIfNeeded();
+    m_elseImpl.evalSubExprsIfNeeded();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_condImpl.cleanup();
+    m_thenImpl.cleanup();
+    m_elseImpl.cleanup();
+  }
+
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  PacketReturnType packet(Index index) const
   {
     static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..3e41f3290
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
+namespace Eigen {
+
+/** \class TensorExecutor
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor executor class.
+  *
+  * This class is responsible for launching the evaluation of the expression on
+  * the specified computing device.
+  */
+namespace internal {
+
+// Default strategy: the expression is evaluated with a single cpu thread.
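+//
+// Illustrative call site (a sketch, not part of this file): operator= on a
+// tensor builds a TensorAssignOp and hands it to the executor, roughly as
+// follows, assuming float tensors:
+//
+//   Eigen::Tensor<float, 3> a(2,3,7), b(2,3,7), c(2,3,7);
+//   a.setRandom(); b.setRandom();
+//   typedef Eigen::TensorAssignOp<Eigen::Tensor<float, 3>, const decltype(a + b)> Assign;
+//   Assign assign(c, a + b);
+//   Eigen::internal::TensorExecutor<const Assign>::run(assign, Eigen::DefaultDevice());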
+template <typename Expression, typename Device = DefaultDevice,
+          bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
+struct TensorExecutor
+{
+  typedef typename Expression::Index Index;
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Expression& expr, const Device& device = Device())
+  {
+    TensorEvaluator<Expression, Device> evaluator(expr, device);
+    evaluator.evalSubExprsIfNeeded();
+
+    const Index size = evaluator.dimensions().TotalSize();
+    for (Index i = 0; i < size; ++i) {
+      evaluator.evalScalar(i);
+    }
+
+    evaluator.cleanup();
+  }
+};
+
+
+template <typename Expression>
+struct TensorExecutor<Expression, DefaultDevice, true>
+{
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+  {
+    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+    evaluator.evalSubExprsIfNeeded();
+
+    const Index size = evaluator.dimensions().TotalSize();
+    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+    const int VectorizedSize = (size / PacketSize) * PacketSize;
+
+    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+      evaluator.evalPacket(i);
+    }
+    for (Index i = VectorizedSize; i < size; ++i) {
+      evaluator.evalScalar(i);
+    }
+
+    evaluator.cleanup();
+  }
+};
+
+
+
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+#ifdef EIGEN_USE_THREADS
+template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess>
+struct EvalRange {
+  static void run(Evaluator& evaluator, const Index first, const Index last) {
+    eigen_assert(last > first);
+    for (Index i = first; i < last; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+  static void run(Evaluator& evaluator, const Index first, const Index last,) {
+    eigen_assert(last > first);
+
+    Index i = first;
+    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    if (last - first > PacketSize) {
+      eigen_assert(first % PacketSize == 0);
+      Index lastPacket = last - (last % PacketSize);
+      for (; i < lastPacket; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+    }
+
+    for (; i < last; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Expression, bool Vectorizable>
+struct TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+{
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+  {
+    TensorEvaluator<Expression, ThreadPoolDevice> evaluator(expr, device);
+    evaluator.evalSubExprsIfNeeded();
+
+    const Index size = evaluator.dimensions().TotalSize();
+
+    static const int PacketSize = Vectorizable ? unpacket_traits<typename TensorEvaluator<Expression, ThreadPoolDevice>::PacketReturnType>::size : 1;
+
+    int blocksz = std::ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+    const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize)));
+    const Index numblocks = size / blocksize;
+
+    TensorEvaluator<Expression, DefaultDevice> single_threaded_eval(expr, DefaultDevice());
+
+    Index i = 0;
+    vector<std::future<void> > results;
+    results.reserve(numblocks);
+    for (int i = 0; i < numblocks; ++i) {
+      results.push_back(std::async(std::launch::async, &EvalRange<TensorEvaluator<Expression, DefaultDevice>, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize));
+    }
+
+    for (int i = 0; i < numblocks; ++i) {
+      results[i].get();
+    }
+
+    if (numblocks * blocksize < size) {
+      EvalRange<TensorEvaluator<Expression, DefaultDevice>, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr);
+    }
+
+    evaluator.cleanup();
+  }
+};
+#endif
+
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
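+//
+// With EIGEN_USE_GPU defined and when compiling with nvcc, an expression can
+// be routed to the specialization below through the device() API (a sketch,
+// assuming a valid CUDA stream):
+//
+//   cudaStream_t stream;
+//   cudaStreamCreate(&stream);
+//   Eigen::GpuDevice gpu_device(&stream);
+//   c.device(gpu_device) = a + b;   // dispatches to TensorExecutor<..., GpuDevice>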
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <typename Evaluator>
+__global__ void EigenMetaKernelNoCheck(Evaluator eval) {
+  const int index = blockIdx.x * blockDim.x + threadIdx.x;
+  eval.evalScalar(index);
+}
+template <typename Evaluator>
+__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) {
+  const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    eval.evalScalar(index);
+  }
+}
+
+template <typename Expression, bool Vectorizable>
+struct TensorExecutor<Expression, GpuDevice, Vectorizable>
+{
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const GpuDevice& device)
+  {
+    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+    evaluator.evalSubExprsIfNeeded();
+
+    const Index size = evaluator.dimensions().TotalSize();
+    const int block_size = std::min(size, 32*32);
+    const int num_blocks = size / block_size;
+    EigenMetaKernelNoCheck<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator);
+
+    const int remaining_items = size % block_size;
+    if (remaining_items > 0) {
+      const int peel_start_offset = num_blocks * block_size;
+      const int peel_block_size = std::min(size, 32);
+      const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size;
+      EigenMetaKernelPeel<TensorEvaluator<Expression, GpuDevice> > <<<peel_num_blocks, peel_block_size, 0, device.stream()>>>(evaluator, peel_start_offset, size);
+    }
+    evaluator.cleanup();
+  }
+};
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 789c04238..d42167da9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -200,7 +200,9 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
     {
-      internal::TensorAssign::run(*this, other);
+      typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign>::run(assign, DefaultDevice());
       return *this;
     }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
new file mode 100644
index 000000000..6f6641de6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor forced-evaluation class: forces its argument to be evaluated
+  * into a temporary buffer before the surrounding expression is evaluated.
+  *
+  */
+namespace internal {
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +template +class TensorForcedEvalOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC ~TensorEvaluator() { + eigen_assert(!m_buffer); + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); + + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 27bfe1d73..c0dffbd0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,13 +25,16 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorAssignOp; + +template class 
TensorEvalToOp; template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template class TensorExecutor; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a06170fa..c97135b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -246,7 +246,9 @@ template class TensorMap : public Tensor EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) { - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e9e74581f..764bba4e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -98,6 +98,13 @@ struct TensorEvaluator, Device> const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); -- cgit v1.2.3 From f80c8e17eb042fc95767417eeca26cd3fa0c6ad6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 10:12:12 -0700 Subject: Silenced a compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 58b1808a3..4bdf74286 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -208,9 +208,9 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; - Dimensions m_dimensions; TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; }; -- cgit v1.2.3 From 774c3c1e0aca307e484b00997b735ee5964d96d4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 10:20:28 -0700 Subject: Created additional unit tests for the tensor code and improved existing ones. 
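Among other things, the device test now exercises forced evaluation nested
inside a larger expression. A minimal sketch of the pattern under test,
using the float tensors these tests rely on:

  Eigen::Tensor<float, 3> in1(2,3,7), in2(2,3,7), out(2,3,7);
  in1.setRandom();
  in2.setRandom();
  // .eval() materializes the sum into a temporary buffer before the rest
  // of the expression is evaluated.
  out = (in1 + in2).eval() * 3.14f + in1.constant(2.718f);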
--- unsupported/test/CMakeLists.txt | 3 ++ unsupported/test/cxx11_tensor_device.cpp | 28 +++++++++++- unsupported/test/cxx11_tensor_lvalue.cpp | 42 +++++++++++++++++ unsupported/test/cxx11_tensor_morphing.cpp | 72 ++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_lvalue.cpp create mode 100644 unsupported/test/cxx11_tensor_morphing.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 34130a192..7458128fb 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -105,7 +105,10 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") +# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") + ei_add_test(cxx11_tensor_morphing "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index 365b109c7..caf2e9735 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -65,6 +65,12 @@ static void test_contextual_eval(Context* context) context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } +template +static void test_forced_contextual_eval(Context* context) +{ + context->out() = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + static void test_cpu() { Eigen::Tensor in1(Eigen::array(2,3,7)); Eigen::Tensor in2(Eigen::array(2,3,7)); @@ -72,9 +78,9 @@ static void test_cpu() { in1.setRandom(); in2.setRandom(); + CPUContext context(in1, in2, out); test_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -82,6 +88,15 @@ static void test_cpu() { } } } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + } + } + } } static void test_gpu() { @@ -111,7 +126,6 @@ static void test_gpu() { GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -120,6 +134,16 @@ static void test_gpu() { } } } + + test_forced_contextual_eval(&context); + cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + } + } + } } diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp new file mode 100644 index 000000000..071f5b406 --- /dev/null +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_compound_assignment() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + mat3 = mat1; + mat3 += mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_lvalue() +{ + CALL_SUBTEST(test_compound_assignment()); +} diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp new file mode 100644 index 000000000..21af9e0b5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -0,0 +1,72 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_reshape() +{ + Tensor tensor1(2,3,1,7,1); + tensor1.setRandom(); + + Tensor tensor2(2,3,7); + Tensor tensor3(6,7); + Tensor tensor4(2,21); + + Tensor::Dimensions dim1{{2,3,7}}; + tensor2 = tensor1.reshape(dim1); + Tensor::Dimensions dim2{{6,7}}; + tensor3 = tensor1.reshape(dim2); + Tensor::Dimensions dim3{{2,21}}; + tensor4 = tensor1.reshape(dim1).reshape(dim3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); + } + } + } +} + + +static void test_reshape_in_expr() { + MatrixXf m1(2,3*5*7*11); + MatrixXf m2(3*5*7*11,13); + m1.setRandom(); + m2.setRandom(); + MatrixXf m3 = m1 * m2; + + TensorMap> tensor1(m1.data(), 2,3,5,7,11); + TensorMap> tensor2(m2.data(), 3,5,7,11,13); + Tensor::Dimensions newDims1{{2,3*5*7*11}}; + Tensor::Dimensions newDims2{{3*5*7*11,13}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; + Tensor tensor3(2,13); + tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); + + Map res(tensor3.data(), 2, 13); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 13; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + +void test_cxx11_tensor_morphing() +{ + CALL_SUBTEST(test_simple_reshape()); + CALL_SUBTEST(test_reshape_in_expr()); +} -- cgit v1.2.3 From 47981c5925caa8316205ea84b17616dd69073678 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:07:57 -0700 Subject: Added support for tensor slicing --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 343 ++++++++++++++++++++- 1 file changed, 327 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 764bba4e6..55954a3a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -20,10 +20,9 @@ namespace Eigen { * */ namespace internal { -template -struct traits > : public traits +template +struct traits > : public traits { - // Type promotion to handle the case where the types of the lhs and 
the rhs are different. typedef typename XprType::Scalar Scalar; typedef typename internal::packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; @@ -32,24 +31,24 @@ struct traits > : public traits::type _Nested; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorReshapingOp& type; + typedef const TensorReshapingOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorReshapingOp type; + typedef TensorReshapingOp type; }; } // end namespace internal -template -class TensorReshapingOp : public TensorBase > +template +class TensorReshapingOp : public TensorBase, WriteAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -71,16 +70,27 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + protected: typename XprType::Nested m_xpr; const NewDimensions m_dims; }; -template -struct TensorEvaluator, Device> +// Eval as rvalue +template +struct TensorEvaluator, Device> { - typedef TensorReshapingOp XprType; + typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { @@ -88,7 +98,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } @@ -96,7 +106,7 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { m_impl.evalSubExprsIfNeeded(); @@ -116,12 +126,313 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + protected: + NewDimensions m_dimensions; + TensorEvaluator m_impl; +}; + + +// Eval as lvalue +// TODO(bsteiner): share the code with the evaluator for rvalue reshapes. 
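// Illustrative use of the lvalue path (a sketch based on the operator=
// defined above): a reshape view can now appear on the left-hand side of an
// assignment, e.g.
//
//   Eigen::Tensor<float, 3> tensor(2,3,7);
//   Eigen::Tensor<float, 2> tensor2d(6,7);
//   Eigen::Tensor<float, 3>::Dimensions dim{{2,3,7}};
//   tensor2d.reshape(dim) = tensor;  // writes through the reshaped view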
+template +struct TensorEvaluator, Device> +{ + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return m_impl.coeffRef(index); + } + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + m_impl.template writePacket(index, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + private: NewDimensions m_dimensions; TensorEvaluator m_impl; }; +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSlicingOp type; +}; + +} // end namespace internal + + + +template +class TensorSlicingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> 
+{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } else { + m_inputStrides[0] = 1; + } + } + + const Sizes& output_dims = op.sizes(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + } else { + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + private: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + const StartIndices m_offsets; + TensorEvaluator m_impl; +}; + + +// Eval as lvalue +// TODO(bsteiner): share the code with the evaluator for rvalue slices. 
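// Illustrative use of the lvalue path (a sketch based on the operator=
// defined above): a slice can be written through, e.g. to copy a small
// tensor into a region of a larger one:
//
//   Eigen::Tensor<float, 3> result(4,5,7);
//   Eigen::Tensor<float, 3> patch(2,2,7);
//   Eigen::DSizes<ptrdiff_t, 3> offsets(Eigen::array<ptrdiff_t, 3>(0,0,0));
//   Eigen::DSizes<ptrdiff_t, 3> extents(Eigen::array<ptrdiff_t, 3>(2,2,7));
//   result.slice(offsets, extents) = patch;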
+template +struct TensorEvaluator, Device> +{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } else { + m_inputStrides[0] = 1; + } + } + + const Sizes& output_dims = op.sizes(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + } else { + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeffRef(inputIndex); + } + + private: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + const StartIndices m_offsets; + TensorEvaluator m_impl; +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H -- cgit v1.2.3 From bc072c5cba4cb6e9e7a6fd5f1e8f0e1231203223 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:08:45 -0700 Subject: Added support for tensor slicing --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 6b53d2a3d..527d47c57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,11 +204,16 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } - // Morphing operators (slicing tbd). + // Morphing operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); } // Force the evaluation of the expression. @@ -257,6 +262,17 @@ class TensorBase : public TensorBase, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + // Select the device on which to evaluate the expression. template TensorDevice device(const DeviceType& device) { -- cgit v1.2.3 From 7d53633e05986c61ce90e7fc36862d529c0cc036 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:10:36 -0700 Subject: Added support for tensor slicing --- unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index c0dffbd0c..5d6e7776a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,6 +25,7 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorSlicingOp; template class TensorAssignOp; template class TensorEvalToOp; -- cgit v1.2.3 From c285fda7f40ca161e6c8e66481d9a68e50613c48 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:30:48 -0700 Subject: Extended the functionality of the TensorDeviceType classes --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 59 ++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 142edda14..b9c8c19fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -21,6 +21,12 @@ struct DefaultDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } }; @@ -28,7 +34,7 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. 
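+//
+// Illustrative usage (sketch): the thread pool device is selected explicitly
+// through the device() API, with the requested core count passed to the
+// constructor below:
+//
+//   Eigen::ThreadPoolDevice my_device(4 /*number of cores*/);
+//   c.device(my_device) = a + b;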
#ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { @@ -37,6 +43,12 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } private: // todo: NUMA, ... @@ -47,20 +59,61 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU +static int m_numMultiProcessors = 0; +static int m_maxThreadsPerBlock = 0; +static int m_maxThreadsPerMultiProcessor = 0; + +static inline int getNumCudaMultiProcessors() { + if (m_numMultiProcessors == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_numMultiProcessors = deviceProp.multiProcessorCount; + } + return m_numMultiProcessors; +} +static inline int maxCudaThreadsPerBlock() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + } + return m_maxThreadsPerBlock; +} +static inline int maxCudaThreadsPerMultiProcessor() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + } + return m_maxThreadsPerMultiProcessor; +} + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { void* result; cudaMalloc(&result, num_bytes); return result; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const { cudaFree(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + cudaMemsetAsync(buffer, c, n, *stream_); + } private: // TODO: multigpu. -- cgit v1.2.3 From cc1bacea5b6b532728a001f8cfcf762e5385dcef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:39:28 -0700 Subject: Improved the efficiency of the tensor evaluation code on thread pools and gpus. 
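The CUDA path now launches a single kernel with a grid-stride loop instead of
a main kernel plus a peel kernel, and sizes the launch from the device
properties. As a worked example (illustrative numbers): on a part with 8
multiprocessors, 2048 resident threads per multiprocessor and 1024 threads
per block, the launch uses 8 * 2048 / 1024 = 16 blocks of 1024 threads, and
each thread then strides over the index space in steps of
blockDim.x * gridDim.x until the whole tensor is covered.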
--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 50 ++++++++-------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 3e41f3290..f50f839fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -77,17 +77,17 @@ struct TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator& evaluator, const Index first, const Index last) { + static void run(Evaluator* evaluator, const Index first, const Index last) { eigen_assert(last > first); for (Index i = first; i < last; ++i) { - evaluator.evalScalar(i); + evaluator->evalScalar(i); } } }; template struct EvalRange { - static void run(Evaluator& evaluator, const Index first, const Index last,) { + static void run(Evaluator* evaluator, const Index first, const Index last) { eigen_assert(last > first); Index i = first; @@ -96,12 +96,12 @@ struct EvalRange { eigen_assert(first % PacketSize == 0); Index lastPacket = last - (last % PacketSize); for (; i < lastPacket; i += PacketSize) { - evaluator.evalPacket(i); + evaluator->evalPacket(i); } } for (; i < last; ++i) { - evaluator.evalScalar(i); + evaluator->evalScalar(i); } } }; @@ -112,24 +112,23 @@ struct TensorExecutor typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { - TensorEvaluator evaluator(expr, device); + typedef TensorEvaluator Evaluator; + Evaluator evaluator(expr, device); evaluator.evalSubExprsIfNeeded(); const Index size = evaluator.dimensions().TotalSize(); - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - TensorEvaluator single_threaded_eval(expr, DefaultDevice()); - Index i = 0; vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -137,7 +136,7 @@ struct TensorExecutor } if (numblocks * blocksize < size) { - EvalRange, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr); + EvalRange::run(&evaluator, numblocks * blocksize, size); } evaluator.cleanup(); @@ -149,15 +148,11 @@ struct TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. 
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernelNoCheck(Evaluator eval) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - eval.evalScalar(index); -} -template -__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - eval.evalScalar(index); +__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; + const int step_size = blockDim.x * gridDim.x; + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); } } @@ -169,19 +164,12 @@ struct TensorExecutor { TensorEvaluator evaluator(expr, device); evaluator.evalSubExprsIfNeeded(); + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); const Index size = evaluator.dimensions().TotalSize(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck > <<>>(evaluator); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel > <<>>(evaluator, peel_start_offset, size); - } + EigenMetaKernel > <<>>(evaluator, size); + eigen_assert(cudaGetLastError() == cudaSuccess); evaluator.cleanup(); } }; -- cgit v1.2.3 From ea0906dfd877b3be91b5b0a28d2040ec360b1d3a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:43:28 -0700 Subject: Improved evaluation of tensor expressions when used as rvalues --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 5c8b079da..ac9829ce9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,6 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) 
*/ +// Generic evaluator template struct TensorEvaluator { @@ -38,7 +39,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -75,6 +76,49 @@ struct TensorEvaluator }; +// Default evaluator for rvalues +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) + : m_data(m.data()), m_dims(m.dimensions()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + template EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt(m_data + index); + } + + protected: + const Scalar* m_data; + Dimensions m_dims; +}; + + + // -------------------- CwiseNullaryOp -------------------- -- cgit v1.2.3 From 25b2f6624d092ed99d0c4936de0c83c9ea4a024d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Jul 2014 12:48:34 -0700 Subject: Improved the speed of slicing operations. 
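The coefficient loops no longer divide by m_outputStrides[0], which is always
1 for the innermost dimension: after peeling dimensions NumDims-1 down to 1,
the remaining linear index is already the coordinate along dimension 0, so a
division and a multiplication are saved per coefficient. Worked example
(illustrative): for a 2-D slice with output strides {1, 4}, linear index 6
splits as 6 / 4 = 1 along dimension 1 with remainder 2, and the remainder is
used directly as the dimension-0 coordinate instead of being divided by
stride 1.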
--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 55954a3a7..f6f67afa7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -320,11 +320,12 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeff(inputIndex); } @@ -399,11 +400,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeff(inputIndex); } @@ -416,11 +418,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeffRef(inputIndex); } -- cgit v1.2.3 From ffd3654f6738bab79db010e02cd67660ecca62c1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:09:46 -0700 Subject: Vectorized the evaluation of expressions involving tensor slices. --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 108 +++++++++++++++++++-- 1 file changed, 98 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index f6f67afa7..3b42c8514 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -273,8 +273,10 @@ struct TensorEvaluator, Devi static const int NumDims = internal::array_size::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
+ IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -329,11 +331,40 @@ struct TensorEvaluator, Devi return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } private: Dimensions m_dimensions; @@ -353,8 +384,8 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -409,11 +440,38 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { @@ -427,6 
+485,36 @@ struct TensorEvaluator, Device> return m_impl.coeffRef(inputIndex); } + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + m_impl.template writePacket(inputIndices[0], x); + } + else { + CoeffReturnType values[packetSize]; + internal::pstore(values, x); + m_impl.coeffRef(inputIndices[0]) = values[0]; + m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + coeffRef(index+i) = values[i]; + } + } + } + private: Dimensions m_dimensions; array m_outputStrides; -- cgit v1.2.3 From 9b7a6f0122f6817a3c12bc75803d4270cd9db507 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:27:27 -0700 Subject: Added tests for tensor slicing --- unsupported/test/cxx11_tensor_morphing.cpp | 132 ++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 21af9e0b5..fbfdaadb7 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -52,8 +52,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - typedef Tensor::DimensionPair DimPair; - array contract_along{{DimPair(1, 0)}}; + array::DimensionPair, 1> contract_along{{1, 0}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -65,8 +64,137 @@ static void test_reshape_in_expr() { } } + +static void test_reshape_as_lvalue() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2d(6,7); + Tensor::Dimensions dim{{2,3,7}}; + tensor2d.reshape(dim) = tensor; + + Tensor tensor5d(2,3,1,7,1); + tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + } + } + } +} + + +static void test_simple_slice() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor slice1(1,1,1,1,1); + Eigen::DSizes indices(Eigen::array(1,2,3,4,5)); + Eigen::DSizes sizes(Eigen::array(1,1,1,1,1)); + slice1 = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + Tensor slice2(1,1,2,2,3); + Eigen::DSizes indices2(Eigen::array(1,1,3,4,5)); + Eigen::DSizes sizes2(Eigen::array(1,1,2,2,3)); + slice2 = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } +} + + +static void test_slice_in_expr() { + MatrixXf m1(7,7); + MatrixXf m2(3,3); + m1.setRandom(); + 
m2.setRandom(); + + MatrixXf m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); + + TensorMap> tensor1(m1.data(), 7, 7); + TensorMap> tensor2(m2.data(), 3, 3); + Tensor tensor3(3,1); + array::DimensionPair, 1> contract_along{{1, 0}}; + + Eigen::DSizes indices1(Eigen::array(1,2)); + Eigen::DSizes sizes1(Eigen::array(3,3)); + Eigen::DSizes indices2(Eigen::array(0,2)); + Eigen::DSizes sizes2(Eigen::array(3,1)); + tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); + + Map res(tensor3.data(), 3, 1); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 1; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + + +static void test_slice_as_lvalue() +{ + Tensor tensor1(2,2,7); + tensor1.setRandom(); + Tensor tensor2(2,2,7); + tensor2.setRandom(); + Tensor tensor3(4,3,5); + tensor3.setRandom(); + Tensor tensor4(4,3,2); + tensor4.setRandom(); + + Tensor result(4,5,7); + Eigen::DSizes sizes12(Eigen::array(2,2,7)); + Eigen::DSizes first_slice(Eigen::array(0,0,0)); + result.slice(first_slice, sizes12) = tensor1; + Eigen::DSizes second_slice(Eigen::array(2,0,0)); + result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; + + Eigen::DSizes sizes3(Eigen::array(4,3,5)); + Eigen::DSizes third_slice(Eigen::array(0,2,0)); + result.slice(third_slice, sizes3) = tensor3; + + Eigen::DSizes sizes4(Eigen::array(4,3,2)); + Eigen::DSizes fourth_slice(Eigen::array(0,2,5)); + result.slice(fourth_slice, sizes4) = tensor4; + + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k)); + VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k)); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k)); + } + for (int k = 5; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5)); + } + } + } +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); CALL_SUBTEST(test_reshape_in_expr()); + CALL_SUBTEST(test_reshape_as_lvalue()); + + CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_slice_in_expr()); + CALL_SUBTEST(test_slice_as_lvalue()); } -- cgit v1.2.3 From 40bb98e76acbe6e077903e15896c100ee6cced39 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:29:51 -0700 Subject: Added primitives to compare tensor dimensions --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 54 ++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3e5687915..3b169a06f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -210,6 +210,60 @@ struct DSizes : array { }; +namespace internal { + +template struct array_size > { + static const size_t value = NumDims; +}; +template struct array_size > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template struct array_size > { +static const size_t value = Sizes::count; +}; +template struct array_size > { +static const size_t value = Sizes::count; +}; +#else +template struct array_size > { + static const size_t value = Sizes::count; +}; +template struct array_size > { + static const size_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { + return get::Base>::value; +}; + +#endif + + +template +struct 
sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) & + sizes_match_up_to_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<0>(dims1) == array_get<0>(dims2)); + } +}; + +template +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + if (array_size::value != array_size::value) { + return false; + } + return sizes_match_up_to_dim::value-1>::run(dims1, dims2); +} + +} // end namespace internal + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H -- cgit v1.2.3 From f7bb7ee3f36474163da7c7f6f88306d553238df2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Jul 2014 10:31:21 -0700 Subject: Fixed the assignment operator of the Tensor and TensorMap classes. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 11 +++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 9 ++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 09601fc7d..547bb74d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -229,6 +229,17 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + // FIXME: we need to resize the tensor to fix the dimensions of the other. + // Unfortunately this isn't possible yet when the rhs is an expression. + // resize(other.dimensions()); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index c97135b63..417717b90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -241,9 +241,16 @@ template class TensorMap : public Tensor } #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; -- cgit v1.2.3 From 1f371e78e659d6e5fd781aea93b6b9c7a0604aeb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Jul 2014 10:32:40 -0700 Subject: Added a few tests to validate the behavior of the assignment operator. 
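The property these tests pin down: same-type assignment deep-copies the coefficients while both tensors keep their own storage. A minimal sketch of that contract (the sizes, values, and function name below are illustrative, not taken from the patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    static void sketch_same_type_assignment() {
      Eigen::Tensor<int, 1> src(5), dst(5);
      src.setRandom();
      dst.setRandom();
      const int* dst_buffer = dst.data();  // remember dst's storage area
      dst = src;                           // exercises the fixed operator=
      assert(dst.data() == dst_buffer);    // storage is reused, not stolen
      for (int i = 0; i < 5; ++i) {
        assert(dst(i) == src(i));          // coefficients are copied
      }
    }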
--- unsupported/test/cxx11_tensor_assign.cpp | 43 ++++++++++++++++++++++++++++++++ unsupported/test/cxx11_tensor_simple.cpp | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index c88872950..b024bed19 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -186,10 +186,53 @@ static void test_3d() } } +static void test_same_type() +{ + Tensor orig_tensor(5); + Tensor dest_tensor(5); + orig_tensor.setRandom(); + dest_tensor.setRandom(); + int* orig_data = orig_tensor.data(); + int* dest_data = dest_tensor.data(); + dest_tensor = orig_tensor; + VERIFY_IS_EQUAL(orig_tensor.data(), orig_data); + VERIFY_IS_EQUAL(dest_tensor.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i)); + } + + TensorFixedSize > orig_array; + TensorFixedSize > dest_array; + orig_array.setRandom(); + dest_array.setRandom(); + orig_data = orig_array.data(); + dest_data = dest_array.data(); + dest_array = orig_array; + VERIFY_IS_EQUAL(orig_array.data(), orig_data); + VERIFY_IS_EQUAL(dest_array.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_array(i), orig_array(i)); + } + + int orig[5] = {1, 2, 3, 4, 5}; + int dest[5] = {6, 7, 8, 9, 10}; + TensorMap > orig_map(orig, 5); + TensorMap > dest_map(dest, 5); + orig_data = orig_map.data(); + dest_data = dest_map.data(); + dest_map = orig_map; + VERIFY_IS_EQUAL(orig_map.data(), orig_data); + VERIFY_IS_EQUAL(dest_map.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest[i], i+1); + } +} + void test_cxx11_tensor_assign() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_same_type()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 1f76033ea..1455f2a4c 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -244,7 +244,7 @@ static void test_simple_assign() epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1; epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1; - Tensor e2(2,3,1); + Tensor e2(3,3,3); e2.setZero(); VERIFY_IS_EQUAL((e2(1,2,0)), 0); -- cgit v1.2.3 From 2116e261fb27c795d153f171467cf7912ff3eec5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 25 Jul 2014 09:47:59 -0700 Subject: Made sure that the data stored in fixed sized tensor is aligned. --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index c9d6517eb..0c4f8a3d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -38,7 +38,7 @@ template Date: Thu, 31 Jul 2014 17:39:04 -0700 Subject: The tensor assignment code now resizes the destination tensor as needed. 
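In practice this removes the need to pre-size the destination by hand. A minimal sketch of the new behavior, with made-up dimensions (not taken from the patch below):

    #include <unsupported/Eigen/CXX11/Tensor>

    static void sketch_resize_on_assign() {
      Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
      a.setRandom();
      b.setRandom();
      Eigen::Tensor<float, 2> c(1, 1);  // deliberately the wrong size
      c = a + b;  // operator= now resizes c to 3x4 before evaluating
    }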
--- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 547bb74d1..fdbe8df4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -79,6 +79,7 @@ class Tensor : public TensorBase > }; static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; typedef DSizes Dimensions; @@ -232,11 +233,9 @@ class Tensor : public TensorBase > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { - // FIXME: we need to resize the tensor to fix the dimensions of the other. - // Unfortunately this isn't possible yet when the rhs is an expression. - // resize(other.dimensions()); typedef TensorAssignOp Assign; Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -244,11 +243,9 @@ class Tensor : public TensorBase > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) { - // FIXME: we need to resize the tensor to fix the dimensions of the other. - // Unfortunately this isn't possible yet when the rhs is an expression. - // resize(other.dimensions()); typedef TensorAssignOp Assign; Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } -- cgit v1.2.3 From 439feca139a093292923e14c085352e5dd2239a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:22:05 -0700 Subject: Reworked the TensorExecutor code to support in place evaluation. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 114 +++++++++++---------- 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f50f839fc..d6e2ab1a2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -11,7 +11,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #ifdef EIGEN_USE_THREADS -#include +#include " #endif namespace Eigen { @@ -28,45 +28,49 @@ namespace internal { // Default strategy: the expression is evaluated with a single cpu thread. 
template::PacketAccess> -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - - const Index size = evaluator.dimensions().TotalSize(); - for (Index i = 0; i < size; ++i) { - evaluator.evalScalar(i); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } } - evaluator.cleanup(); } }; template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - - const Index size = evaluator.dimensions().TotalSize(); - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); - } - for (Index i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } } - evaluator.cleanup(); } }; @@ -107,38 +111,40 @@ struct EvalRange { }; template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; Evaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - - const Index size = evaluator.dimensions().TotalSize(); - - static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; - - int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); - } + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); + + static const int PacketSize = Vectorizable ? 
unpacket_traits::size : 1; + + int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + } - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } - if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); + if (numblocks * blocksize < size) { + EvalRange::run(&evaluator, numblocks * blocksize, size); + } } - evaluator.cleanup(); } }; @@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { } template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); - const int block_size = maxCudaThreadsPerBlock(); - - const Index size = evaluator.dimensions().TotalSize(); - EigenMetaKernel > <<>>(evaluator, size); - eigen_assert(cudaGetLastError() == cudaSuccess); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); + + const Index size = evaluator.dimensions().TotalSize(); + EigenMetaKernel > <<>>(evaluator, size); + assert(cudaGetLastError() == cudaSuccess); + } evaluator.cleanup(); } }; -- cgit v1.2.3 From b1892ab14d8ac94bef233d0cef0ef7df1e9a592e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:26:44 -0700 Subject: Added support for in place evaluation to simple tensor expressions. Use memcpy to speed up tensor copies whenever possible. --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 12 +++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 55 ++++++++++++++++------ 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index a2a925775..3bfe80c9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,6 +102,7 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -112,9 +113,14 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_leftImpl.evalSubExprsIfNeeded(); - m_rightImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(internal::dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e.
if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ac9829ce9..0f969036c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -39,13 +39,20 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) - : m_data(const_cast(m.data())), m_dims(m.dimensions()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -70,9 +77,12 @@ struct TensorEvaluator return internal::pstoret(m_data + index, x); } + Scalar* data() const { return m_data; } + protected: Scalar* m_data; Dimensions m_dims; + const Device& m_device; }; @@ -98,7 +108,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -112,6 +122,8 @@ struct TensorEvaluator return internal::ploadt(m_data + index); } + const Scalar* data() const { return m_data; } + protected: const Scalar* m_data; Dimensions m_dims; @@ -138,13 +150,14 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -158,6 +171,8 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } + Scalar* data() const { return NULL; } + private: const NullaryOp m_functor; TensorEvaluator m_argImpl; @@ -183,14 +198,16 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType 
PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_argImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); @@ -207,6 +224,8 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: const UnaryOp m_functor; TensorEvaluator m_argImpl; @@ -233,6 +252,7 @@ struct TensorEvaluator::Dimensions Dimensions; @@ -243,9 +263,10 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: const BinaryOp m_functor; TensorEvaluator m_leftImpl; @@ -289,6 +312,7 @@ struct TensorEvaluator { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -299,10 +323,11 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_condImpl.evalSubExprsIfNeeded(); - m_thenImpl.evalSubExprsIfNeeded(); - m_elseImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); @@ -327,6 +352,8 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; -- cgit v1.2.3 From 1aa2bf82741f2f51fbf0a29ff95e0d017f6962a3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:27:58 -0700 Subject: Support for in place evaluation of expressions containing slicing and reshaping operations --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 30 ++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3b42c8514..2b1b503cf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -103,13 +103,14 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -126,6 +127,8 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + Scalar* data() const { return NULL; } + protected: NewDimensions m_dimensions; TensorEvaluator m_impl; @@ -150,13 +153,14 @@ struct 
TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -182,6 +186,8 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + Scalar* data() const { return NULL; } + private: NewDimensions m_dimensions; TensorEvaluator m_impl; @@ -306,14 +312,16 @@ struct TensorEvaluator, Devi } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -366,6 +374,8 @@ struct TensorEvaluator, Devi } } + Scalar* data() const { return NULL; } + private: Dimensions m_dimensions; array m_outputStrides; @@ -415,14 +425,16 @@ struct TensorEvaluator, Device> } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -515,6 +527,8 @@ struct TensorEvaluator, Device> } } + Scalar* data() const { return NULL; } + private: Dimensions m_dimensions; array m_outputStrides; -- cgit v1.2.3 From 72e75297089e7c141108696195763c024571974d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:29:40 -0700 Subject: Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d6e2ab1a2..faf965df8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -11,7 +11,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #ifdef EIGEN_USE_THREADS -#include " +#include #endif namespace Eigen { -- cgit v1.2.3 From f8fad09301106c574ed88ffde52e15483d14673f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:33:18 -0700 Subject: Updated the convolution and contraction evaluators to follow the new EvalSubExprsIfNeeded API.
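The contract being adopted here, sketched as a stand-alone skeleton (placeholder names throughout; this is not the convolution or contraction code, whose diff follows):

    // evalSubExprsIfNeeded(dest) returns true iff the caller still has to
    // assign coefficients itself. When dest is non-null and the evaluator
    // can materialize its result straight into dest, it does so and
    // returns false so the caller skips its evalScalar/evalPacket loop.
    template <typename Scalar>
    struct SketchEvaluator {
      bool evalSubExprsIfNeeded(Scalar* dest) {
        if (dest != 0 && resultIsContiguous()) {
          copyResultTo(dest);      // e.g. one memcpy of the result buffer
          return false;            // caller skips the assignment loop
        }
        evalIntoInternalBuffer();
        return true;               // caller assigns as usual
      }
      // Placeholder helpers so this sketch is self-contained.
      bool resultIsContiguous() const { return false; }
      void copyResultTo(Scalar*) {}
      void evalIntoInternalBuffer() {}
    };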
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2e12fd15..8d7a1351e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,9 +184,10 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); @@ -151,11 +151,12 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:36:33 -0700 Subject: Added missing apis. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8d7a1351e..b2969337f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -173,7 +173,7 @@ struct TensorEvaluator::value> m_leftOffsets; array::value> m_rightOffsets; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 8864c5329..e3068dcae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -148,6 +148,7 @@ struct TensorEvaluator m_inputStride; array m_outputStride; -- cgit v1.2.3 From f1d8c13dbcbe38938dcd727f9b50339a981197c3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:40:26 -0700 Subject: Fixed misc typos. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2969337f..897d73806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,7 +184,7 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:44:47 -0700 Subject: Added ability to get the nth element from an abstract array type. 
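A usage sketch (illustrative only; array_get lives in Eigen::internal, and with this patch it accepts the emulated compile-time lists as well as plain arrays; the brace initialization below assumes the C++11 std::array path):

    #include <unsupported/Eigen/CXX11/Tensor>

    static void sketch_array_get() {
      Eigen::array<int, 3> sizes = {{7, 8, 9}};
      // Compile-time indexed access into an abstract array type.
      const int middle = Eigen::internal::array_get<1>(sizes);  // == 8
      (void)middle;
    }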
--- unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 1d3164d6a..4c6b95773 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -181,6 +181,15 @@ array repeat(t v) { return array; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list& a) { + return get >::value; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list& a) { + return get >::value; +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -196,8 +205,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const T& array_get(const array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } -- cgit v1.2.3 From eeb43f9e2b7ac56af685d8fc494685df8227a53f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:22:47 -0700 Subject: Added support for padding, striding, and shuffling --- unsupported/Eigen/CXX11/Tensor | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 15 ++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 163 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 168 ++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorStriding.h | 172 +++++++++++++++++++++ 6 files changed, 524 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7e504b302..0775d440a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,6 +42,9 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 527d47c57..0295fcdbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -215,6 +215,21 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides&
strides) const { + return TensorStridingOp(derived(), strides); + } // Force the evaluation of the expression. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 5d6e7776a..baa5968bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -26,6 +26,9 @@ template class template class TensorConvolutionOp; template class TensorReshapingOp; template class TensorSlicingOp; +template class TensorPaddingOp; +template class TensorShufflingOp; +template class TensorStridingOp; template class TensorAssignOp; template class TensorEvalToOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 000000000..45558d7dd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,163 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only 0-padding is supported. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPaddingOp type; +}; + +} // end namespace internal + + + +template +class TensorPaddingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims) + : m_xpr(expr), m_padding_dims(padding_dims) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + PaddingDimensions m_padding; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 000000000..4dfc99203 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorShufflingOp type; +}; + +} // end namespace internal + + + +template +class TensorShufflingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shuffle() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_shuffle(op.shuffle()) + { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[m_shuffle[i]]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * 
m_inputStrides[m_shuffle[i]]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[m_shuffle[0]]; + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + Shuffle m_shuffle; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 000000000..7acdbfc72 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,172 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingOp type; +}; + +} // end namespace internal + + + +template +class TensorStridingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = 
/*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] *= op.strides()[i]; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + // Strides m_strides; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H -- cgit v1.2.3 From 8c8db49331a89236be7fdf045279504dd7d1797a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:25:22 -0700 Subject: Added a few regression tests --- unsupported/test/CMakeLists.txt | 3 + unsupported/test/cxx11_tensor_padding.cpp | 54 +++++++++++++ unsupported/test/cxx11_tensor_shuffling.cpp | 116 ++++++++++++++++++++++++++++ unsupported/test/cxx11_tensor_striding.cpp | 71 +++++++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_padding.cpp create mode 100644 unsupported/test/cxx11_tensor_shuffling.cpp create mode 100644 unsupported/test/cxx11_tensor_striding.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 406564673..cd2063848 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -109,6 +109,9 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") + ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_shuffling "-std=c++0x") + ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp new file mode 100644 index 000000000..d93bb1883 --- 
/dev/null +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_padding() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array, 4> paddings; + paddings[0] = make_pair(0, 0); + paddings[1] = make_pair(2, 1); + paddings[2] = make_pair(3, 4); + paddings[3] = make_pair(0, 0); + + Tensor padded; + padded = tensor.pad(paddings); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(tensor(i,j-2,k-3,l), padded(i,j,k,l)); + } else { + VERIFY_IS_EQUAL(0.0f, padded(i,j,k,l)); + } + } + } + } + } +} + + +void test_cxx11_tensor_padding() +{ + CALL_SUBTEST(test_simple_padding()); +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp new file mode 100644 index 000000000..92dd01a52 --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_shuffling() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +static void test_expr_shuffling() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array shuffles; + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor expected; + expected = tensor.shuffle(shuffles); + + Tensor result(5,7,3,2); + + array src_slice_dim(Eigen::array(2,3,1,7)); + array src_slice_start(Eigen::array(0,0,0,0)); + array dst_slice_dim(Eigen::array(1,7,3,2)); + array dst_slice_start(Eigen::array(0,0,0,0)); + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles); + src_slice_start[2] += 1; + dst_slice_start[0] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 3); + VERIFY_IS_EQUAL(result.dimension(3), 2); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_shuffling() +{ + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); +} diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp new file mode 100644 index 000000000..502569d1d --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -0,0 +1,71 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_striding() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + + Tensor no_stride; + no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + Tensor stride; + stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 1; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_striding() +{ + CALL_SUBTEST(test_simple_striding()); +} -- cgit v1.2.3 From 756292f8aa124c842d1e6d9beeb0c416c0d9a7f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:32:59 -0700 Subject: Fixed compilation errors --- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_padding.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cd2063848..520935105 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,7 +110,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") - ei_add_test(cxx11_tensor_shuffling "-std=c++0x") +# ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index d93bb1883..cb010f512 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -18,11 +18,11 @@ static void test_simple_padding() Tensor tensor(2,3,5,7); tensor.setRandom(); - array, 4> paddings; - paddings[0] = make_pair(0, 0); - paddings[1] = make_pair(2, 1); - paddings[2] = make_pair(3, 4); - paddings[3] = make_pair(0, 0); + array, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); Tensor padded; padded = tensor.pad(paddings); -- cgit v1.2.3 From 33c702c79fe227a5b22229c26af276d359a6cb1d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 22:13:21 -0700 Subject: Added support for fast integer divisions by a constant Sped up tensor slicing by a factor of 3 by using these fast integer divisions. 
--- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 82 ++++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 26 ++++--- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_intdiv.cpp | 77 ++++++++++++++++++++ 5 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h create mode 100644 unsupported/test/cxx11_tensor_intdiv.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 0775d440a..82552c3c2 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -34,6 +34,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 000000000..cf97031be --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. 
+ * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +template +struct TensorIntDivisor { + public: + TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + TensorIntDivisor(const T divider) { + static const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider <= (1<<(N-1)) - 1); + + // fast ln2 + const int leading_zeros = __builtin_clz(divider); + const int l = N - (leading_zeros+1); + + multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; + shift1 = (std::min)(1, l); + shift2 = (std::max)(0, l-1); + } + + // Must have 0 <= numerator <= 2^32-1 + T divide(const T numerator) const { + static const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(numerator <= (1ull<> 32; + uint32_t t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +template +static T operator / (const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2b1b503cf..ca3735d64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -305,8 +305,10 @@ struct TensorEvaluator, Devi for (int i = 0; i < NumDims; ++i) { if (i > 0) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } else { m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; } } } @@ -331,7 +333,7 @@ struct TensorEvaluator, Devi { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -349,8 +351,8 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -379,6 +381,7 @@ struct TensorEvaluator, Devi private: Dimensions m_dimensions; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; @@ -418,9 +421,11 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { if (i > 0) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } else { m_outputStrides[0] = 1; - } + m_fastOutputStrides[0] = 1; + } } } @@ -444,7 +449,7 @@ struct TensorEvaluator, Device> { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -460,8 +465,8 @@ struct 
TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -489,7 +494,7 @@ struct TensorEvaluator, Device> { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -504,8 +509,8 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -532,6 +537,7 @@ struct TensorEvaluator, Device> private: Dimensions m_dimensions; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 520935105..e2204827e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,6 +106,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp new file mode 100644 index 000000000..a510dc695 --- /dev/null +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + + +static void test_signed_32bit() +{ + for (int32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor div(i); + + for (int32_t j = 0; j < 25000; ++j) { + const int32_t fast_div = j / div; + const int32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_32bit() +{ + for (uint32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor div(i); + + for (uint32_t j = 0; j < 25000; ++j) { + const uint32_t fast_div = j / div; + const uint32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_signed_64bit() +{ + for (int64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor div(i); + + for (int64_t j = 0; j < 25000; ++j) { + const int64_t fast_div = j / div; + const int64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_64bit() +{ + for (uint64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor div(i); + + for (uint64_t j = 0; j < 25000; ++j) { + const uint64_t fast_div = j / div; + const uint64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +void test_cxx11_tensor_intdiv() +{ + CALL_SUBTEST(test_signed_32bit()); + CALL_SUBTEST(test_unsigned_32bit()); + CALL_SUBTEST(test_signed_64bit()); + CALL_SUBTEST(test_unsigned_64bit()); +} -- cgit v1.2.3 From 9ac3c821ea3b956634116bcdf80bfab7d9a00d91 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Aug 2014 16:57:10 -0700 Subject: Improved the speed of convolutions when running on cuda devices --- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 632 ++++++++++++++++++++- 1 file changed, 622 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4158271c3..7d0a21c3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -20,6 +20,126 @@ namespace Eigen { * */ namespace internal { + + +template class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } else { + inputStrides[0] = 1; + outputStrides[0] = 1; + } + } + + array cudaInputDimensions; + array cudaOutputDimensions; + array tmp = dimensions; + array ordering; + for (int i = 0; i < NumKernelDims; ++i) { + ordering[i] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[i] = input_dims[ordering[i]]; + cudaOutputDimensions[i] = dimensions[ordering[i]]; + } + int written = NumKernelDims; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > 
NumKernelDims) { + m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; + m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + return i * m_inputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + return i * m_outputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + return i * m_inputStrides[0] + j*m_inputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + return i * m_outputStrides[0] + j * m_outputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + } + + private: + static const size_t NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_cudaInputStrides; + array m_cudaOutputStrides; +}; + + + template struct traits > { @@ -75,15 +195,15 @@ class TensorConvolutionOp : public TensorBase::type& inputExpression() const { return m_input_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& kernelExpression() const { return m_kernel_xpr; } @@ -99,8 +219,8 @@ struct TensorEvaluator XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; - static const int KernelDims = internal::array_size::value; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -111,7 +231,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -124,7 +244,8 @@ struct TensorEvaluator m_inputStride; array m_outputStride; - array m_indexStride; - array m_kernelStride; + array m_indexStride; + array m_kernelStride; TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; Dimensions m_dimensions; }; + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + + + + +template +__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + + +template +__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + + +template +__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() 
{ + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = sharedMemPerBlock(); + const int maxThreadsPerBlock = maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + + int maxX; + dim3 block_size; + if (m_indices[0] == 0) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min)(inner_dim, numX); + const int maxP = (std::min)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = (std::min)(maxThreadsPerBlock, maxX); + block_size.y = (std::min)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min)(inner_dim, numP); + maxX = (std::min)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = (std::min)(warpSize, maxX); + block_size.y = (std::min)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, min(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + EigenConvolutionKernel1D, Index, InputDims, 4> <<>>(m_inputImpl, 
indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + EigenConvolutionKernel1D, Index, InputDims, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + EigenConvolutionKernel1D, Index, InputDims, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + case 2: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min)(inner_dim, numX); + const int maxY = (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = (std::min)(1024, maxX); + block_size.y = (std::min)(1024/block_size.x, maxY); + block_size.z = (std::min)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, min(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0], m_indices[1]); + const array kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + EigenConvolutionKernel2D, Index, InputDims, 4, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, 4, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + EigenConvolutionKernel2D, Index, InputDims, 7, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, 7, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, Eigen::Dynamic, 
Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + case 3: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + const int kernel_size_z = m_kernelImpl.dimensions()[2]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numZ = dimensions()[m_indices[2]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = (std::min)(32, maxX); + block_size.y = (std::min)(32, maxY); + block_size.z = (std::min)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[0], m_indices[1], m_indices[2]); + const array kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + EigenConvolutionKernel3D, Index, InputDims> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + default: { + assert(false && "not supported yet"); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + assert(m_buf); + assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -- cgit v1.2.3 From 3d298da2696ac956a430f6fbef93bf65ada0d304 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Aug 2014 17:00:50 -0700 Subject: Added support for broadcasting --- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 + .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 186 +++++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_broadcasting.cpp 
| 114 +++++++++++++ 6 files changed, 309 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h create mode 100644 unsupported/test/cxx11_tensor_broadcasting.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 82552c3c2..ebe6419e8 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,6 +42,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0295fcdbc..da5148a5b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,6 +204,12 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp(derived(), broadcast); + } + // Morphing operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 000000000..3b2a9c8b9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,186 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorBroadcastingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorBroadcastingOp type; +}; + +} // end namespace internal + + + +template +class TensorBroadcastingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index % m_impl.dimensions()[0]); + return m_impl.coeff(inputIndex); + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + const Index innermostLoc = index % m_impl.dimensions()[0]; + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeff(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index baa5968bc..afbcc9486 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -22,6 +22,7 @@ template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; template class TensorReductionOp; +template class TensorBroadcastingOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e2204827e..164388746 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -109,6 +109,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") + ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp new file mode 100644 index 000000000..9663912a4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -0,0 +1,114 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_broadcasting() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + + Tensor no_broadcast; + no_broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); + VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l)); + } + } + } + } + + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 1; + broadcasts[3] = 4; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 4); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 28); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 28; ++l) { + VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l)); + } + } + } + } +} + + +static void test_vectorized_broadcasting() +{ + Tensor tensor(8,3,5); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; + + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_broadcasting() +{ + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); +} -- cgit v1.2.3 From fb5c1e9097886616d40a0988af5ca706292e54eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 23 Aug 2014 13:18:30 -0700 Subject: Optimized and cleaned up the tensor morphing code --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 218 ++++++--------------- 1 file changed, 63 insertions(+), 155 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ca3735d64..d9a6b3f1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - Scalar* data() const { return NULL; } + Scalar* data() const { return m_impl.data(); } protected: NewDimensions m_dimensions; @@ -136,10 +136,12 @@ struct TensorEvaluator, Device> // Eval as lvalue -// TODO(bsteiner): 
share the code with the evaluator for rvalue reshapes. template -struct TensorEvaluator, Device> + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + { + typedef TensorEvaluator, Device> Base; typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; @@ -149,7 +151,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + : Base(op, device) { } typedef typename XprType::Index Index; @@ -157,40 +159,15 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return m_impl.coeffRef(index); + return this->m_impl.coeffRef(index); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - m_impl.template writePacket(index, x); + this->m_impl.template writePacket(index, x); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - Scalar* data() const { return NULL; } - - private: - NewDimensions m_dimensions; - TensorEvaluator m_impl; }; @@ -286,7 +263,7 @@ struct TensorEvaluator, Devi }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { for (int i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); @@ -321,24 +298,37 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_impl.evalSubExprsIfNeeded(NULL); + if (data && m_impl.data()) { + Index contiguous_values = 1; + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
+ if (contiguous_values > 2 * m_device.numThreads()) { + Scalar* src = m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy(data+i, src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } template @@ -376,23 +366,37 @@ struct TensorEvaluator, Devi } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + return inputIndex; + } - private: Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; + const Device& m_device; }; // Eval as lvalue -// TODO(bsteiner): share the code with the evaluator for rvalue slices. template struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; @@ -402,32 +406,8 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) - { - for (int i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } else { - m_inputStrides[0] = 1; - } - } - - const Sizes& output_dims = op.sizes(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } else { - m_outputStrides[0] = 1; - m_fastOutputStrides[0] = 1; - } - } - } + : Base(op, device) + { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -435,71 +415,9 @@ struct TensorEvaluator, Device> typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index 
/ m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - static const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; - } - else { - CoeffReturnType values[packetSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeffRef(inputIndex); + return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_STRONG_INLINE @@ -509,38 +427,28 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - m_impl.template writePacket(inputIndices[0], x); + this->m_impl.template writePacket(inputIndices[0], x); } else { CoeffReturnType values[packetSize]; internal::pstore(values, x); - m_impl.coeffRef(inputIndices[0]) = values[0]; - m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; for (int i = 1; i < packetSize-1; ++i) { - coeffRef(index+i) = values[i]; + 
this->coeffRef(index+i) = values[i]; } } } - - Scalar* data() const { return NULL; } - - private: - Dimensions m_dimensions; - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - const StartIndices m_offsets; - TensorEvaluator m_impl; }; -- cgit v1.2.3 From 36fffe48f7231e07915ec231d33cf46faa0fa918 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 23 Aug 2014 14:35:41 -0700 Subject: Misc api improvements and cleanups --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 9 ++++++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 26 +++++++++++++++ unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_morphing.cpp | 37 +++++++++++----------- 4 files changed, 55 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index b9c8c19fe..ef5e11537 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -27,6 +27,10 @@ struct DefaultDevice { EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return 1; + } }; @@ -115,6 +119,11 @@ struct GpuDevice { cudaMemsetAsync(buffer, c, n, *stream_); } + EIGEN_STRONG_INLINE size_t numThreads() const { + // Fixme: + return 32; + } + private: // TODO: multigpu. const cudaStream_t* stream_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3b169a06f..5a113dc19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -195,6 +195,32 @@ struct DSizes : array { } EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + DSizes& operator = (const array& other) { *static_cast(this) = other; return *this; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 164388746..615ff3e6d 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,7 +110,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") -# ei_add_test(cxx11_tensor_morphing "-std=c++0x") + ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index fbfdaadb7..2a6a97856 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ 
b/unsupported/test/cxx11_tensor_morphing.cpp @@ -52,7 +52,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{1, 0}}; + array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -74,7 +74,8 @@ static void test_reshape_as_lvalue() Tensor::Dimensions dim{{2,3,7}}; tensor2d.reshape(dim) = tensor; - Tensor tensor5d(2,3,1,7,1); + float scratch[2*3*1*7*1]; + TensorMap> tensor5d(scratch, 2,3,1,7,1); tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; for (int i = 0; i < 2; ++i) { @@ -94,14 +95,14 @@ static void test_simple_slice() tensor.setRandom(); Tensor slice1(1,1,1,1,1); - Eigen::DSizes indices(Eigen::array(1,2,3,4,5)); - Eigen::DSizes sizes(Eigen::array(1,1,1,1,1)); + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); Tensor slice2(1,1,2,2,3); - Eigen::DSizes indices2(Eigen::array(1,1,3,4,5)); - Eigen::DSizes sizes2(Eigen::array(1,1,2,2,3)); + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 2; ++j) { @@ -124,12 +125,12 @@ static void test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{1, 0}}; + array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; - Eigen::DSizes indices1(Eigen::array(1,2)); - Eigen::DSizes sizes1(Eigen::array(3,3)); - Eigen::DSizes indices2(Eigen::array(0,2)); - Eigen::DSizes sizes2(Eigen::array(3,1)); + Eigen::DSizes indices1(1,2); + Eigen::DSizes sizes1(3,3); + Eigen::DSizes indices2(0,2); + Eigen::DSizes sizes2(3,1); tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); Map res(tensor3.data(), 3, 1); @@ -153,18 +154,18 @@ static void test_slice_as_lvalue() tensor4.setRandom(); Tensor result(4,5,7); - Eigen::DSizes sizes12(Eigen::array(2,2,7)); - Eigen::DSizes first_slice(Eigen::array(0,0,0)); + Eigen::DSizes sizes12(2,2,7); + Eigen::DSizes first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; - Eigen::DSizes second_slice(Eigen::array(2,0,0)); + Eigen::DSizes second_slice(2,0,0); result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; - Eigen::DSizes sizes3(Eigen::array(4,3,5)); - Eigen::DSizes third_slice(Eigen::array(0,2,0)); + Eigen::DSizes sizes3(4,3,5); + Eigen::DSizes third_slice(0,2,0); result.slice(third_slice, sizes3) = tensor3; - Eigen::DSizes sizes4(Eigen::array(4,3,2)); - Eigen::DSizes fourth_slice(Eigen::array(0,2,5)); + Eigen::DSizes sizes4(4,3,2); + Eigen::DSizes fourth_slice(0,2,5); result.slice(fourth_slice, sizes4) = tensor4; for (int j = 0; j < 2; ++j) { -- cgit v1.2.3 From 2959045f2fe111f93b23517fd6f7afe49720a290 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 26 Aug 2014 09:47:18 -0700 Subject: Optimized the tensor padding code. 
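[Editorial note] The padding evaluator below gains a vectorized packet() path: when an entire packet falls inside a padding region it returns a packet of zeros outright, when it lies entirely between the padding zones it forwards to the input's packet(), and only packets that straddle a boundary fall back to the scalar gather in packetWithPossibleZero(). As a minimal sketch of the user-facing API this optimizes — the tensor sizes and padding amounts here are illustrative, not taken from this patch:

    // Zero-pad a 2x3 tensor along dimension 0: one row of zeros before, two after.
    // (Sizes and padding amounts are illustrative only.)
    Eigen::Tensor<float, 2> input(2, 3);
    input.setRandom();
    Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 2> paddings;
    paddings[0] = std::make_pair(1, 2);  // (pad before, pad after) for dimension 0
    paddings[1] = std::make_pair(0, 0);  // leave dimension 1 untouched
    Eigen::Tensor<float, 2> padded(5, 3);  // 1 + 2 + 2 = 5 along dimension 0
    padded = input.pad(paddings);          // border coefficients evaluate to zero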
--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 95 ++++++++++++++++++---- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 45558d7dd..4482c0992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,7 +87,7 @@ struct TensorEvaluator, Device enum { IsAligned = false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -100,15 +100,13 @@ struct TensorEvaluator, Device } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -128,7 +126,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { return Scalar(0); @@ -136,21 +134,90 @@ struct TensorEvaluator, Device inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[0].first); return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const int first = index; + const int last = index + packetSize - 1; + const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const int lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones.
+ const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const int lastPaddedLeft = m_padding[0].first; + const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const int lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } Scalar* data() const { return NULL; } protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + PaddingDimensions m_padding; Dimensions m_dimensions; - array m_outputStrides; + array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; }; -- cgit v1.2.3 From b24fe22b1a4518f27ca064d496bfdb6c96d973ab Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Sep 2014 11:38:13 -0700 Subject: Improved the performance of the tensor convolution code by a factor of about 4.
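[Editorial note] The main win in the patch below comes from preloading the kernel: unless the kernel expression already lives in a contiguous buffer, preloadKernel() materializes it once through a TensorEvalToOp so the inner convolution loops read plain memory instead of re-evaluating the kernel expression, and the new packet path accumulates with pmadd. A minimal sketch of the convolve API this accelerates — the tensor and kernel sizes are illustrative only, not taken from the patch:

    // Convolve a rank-3 input with a 1-D kernel along dimension 1.
    // (All sizes are illustrative only.)
    Eigen::Tensor<float, 3> input(20, 30, 40);
    Eigen::Tensor<float, 1> kernel(3);
    input.setRandom();
    kernel.setRandom();
    Eigen::array<ptrdiff_t, 1> conv_dims;
    conv_dims[0] = 1;  // dimension along which to convolve
    // The convolved dimension shrinks from 30 to 30 - 3 + 1 = 28.
    Eigen::Tensor<float, 3> result(20, 28, 40);
    result = input.convolve(kernel, conv_dims);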
--- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 139 +++++++++++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 7 +- 2 files changed, 107 insertions(+), 39 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 7d0a21c3b..4a5fd9c79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -226,22 +226,18 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; - } else { - m_inputStride[0] = 1; - } + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; } m_dimensions = m_inputImpl.dimensions(); @@ -251,7 +247,6 @@ struct TensorEvaluator 0) { m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; } else { @@ -260,16 +255,12 @@ struct TensorEvaluator 0) { - m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; - } else { - m_outputStride[0] = 1; - } + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; } } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -278,57 +269,126 @@ struct TensorEvaluator= 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - CoeffReturnType result = CoeffReturnType(0); - convolve(startInput, 0, 0, result); + convolve(firstInput(index), 0, NumKernelDims-1, result); return result; } - /* TODO: vectorization template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - assert(false); - }*/ + const int PacketSize = internal::unpacket_traits::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 
1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + startInput += index; + return startInput; + } EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { const Index input = firstIndex + j * m_indexStride[DimIndex]; const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex < NumKernelDims-1) { - convolve(input, kernel, DimIndex+1, accum); + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } - accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel); + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); } } } - Scalar* data() const { return NULL; } + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } - private: // No copy, no assignment TensorEvaluator(const TensorEvaluator&); TensorEvaluator& operator = (const TensorEvaluator&); @@ -341,6 +401,11 @@ struct TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index db716a80e..587cbd5ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,8 +108,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { @@ -134,6 +135,8 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + Scalar* data() const { return NULL; } + private: TensorEvaluator m_impl; const Device& m_device; -- cgit v1.2.3 From f50548e86af75fd8e0d1689a9fb4184cf1fec509 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 19:50:27 -0700 Subject: Added missing tensor copy constructors. As a result it is now possible to declare and initialize a tensor on the same line, as in: Tensor T = A + B; or Tensor T(A.reshape(new_shape)); --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index fdbe8df4c..879057f38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -55,7 +55,7 @@ namespace Eigen { * change dramatically. 
* * - * \ref TopicStorageOrders + * \ref TopicStorageOrders */ template @@ -75,7 +75,7 @@ class Tensor : public TensorBase > enum { IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), - PacketAccess = true, + PacketAccess = (internal::packet_traits::size > 1), }; static const int Options = Options_; @@ -224,12 +224,31 @@ class Tensor : public TensorBase > } #endif - inline Tensor(const array& dimensions) - : m_storage(internal::array_prod(dimensions), dimensions) + inline explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { -- cgit v1.2.3 From d43f737b4ad52e84a3b4d954d9bfb4c40cf9e819 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 20:02:28 -0700 Subject: Added support for evaluation of tensor shuffling operations as lvalues --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 19 ++-- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 104 ++++++++++++++++----- 2 files changed, 96 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index da5148a5b..2da8f8cc8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -222,19 +222,19 @@ class TensorBase return TensorSlicingOp(derived(), startIndices, sizes); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorPaddingOp + const TensorPaddingOp pad(const PaddingDimensions& padding) const { - return TensorPaddingOp(derived(), padding); + return TensorPaddingOp(derived(), padding); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp + const TensorShufflingOp shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + return TensorShufflingOp(derived(), shuffle); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp + const TensorStridingOp stride(const Strides& strides) const { - return TensorStridingOp(derived(), strides); + return TensorStridingOp(derived(), strides); } // Force the evaluation of the expression. @@ -244,6 +244,7 @@ class TensorBase } protected: + template friend class Tensor; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } @@ -258,6 +259,7 @@ class TensorBase : public TensorBase::type PacketReturnType; + template friend class Tensor; template friend class TensorBase; EIGEN_DEVICE_FUNC @@ -293,6 +295,11 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } // Select the device on which to evaluate the expression. 
template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 4dfc99203..f7e7fc107 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorShufflingOp : public TensorBase, WriteAccessors> +class TensorShufflingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -94,33 +94,38 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shuffle()) + : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shuffle(); for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = input_dims[m_shuffle[i]]; + m_dimensions[i] = input_dims[shuffle[i]]; } + array inputStrides; + for (int i = 0; i < NumDims; ++i) { if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; } else { - m_inputStrides[0] = 1; + inputStrides[0] = 1; m_outputStrides[0] = 1; } } + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[shuffle[i]]; + } } - // typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -136,33 +141,90 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[m_shuffle[i]]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[m_shuffle[0]]; - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } Scalar* data() const { return NULL; } protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } + Dimensions m_dimensions; - Shuffle m_shuffle; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; }; +// 
Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H -- cgit v1.2.3 From 1abe4ed14c0012d85e833c5f507f282cf26edc36 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 20:27:28 -0700 Subject: Created more regression tests --- test/main.h | 1 + unsupported/test/cxx11_tensor_assign.cpp | 26 +++ unsupported/test/cxx11_tensor_contraction.cpp | 166 +++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 279 ++++++++++++++++++++++---- unsupported/test/cxx11_tensor_shuffling.cpp | 47 +++++ unsupported/test/cxx11_tensor_simple.cpp | 26 +++ 6 files changed, 510 insertions(+), 35 deletions(-) diff --git a/test/main.h b/test/main.h index 3295dcb71..763cec8f9 100644 --- a/test/main.h +++ b/test/main.h @@ -207,6 +207,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) +#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b)) #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b)) diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index b024bed19..f2b126413 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -228,6 +228,30 @@ static void test_same_type() } } +static void test_auto_resize() +{ + Tensor tensor1; + Tensor tensor2(3); + Tensor tensor3(5); + Tensor tensor4(7); + + Tensor new_tensor(5); + new_tensor.setRandom(); + + tensor1 = tensor2 = tensor3 = tensor4 = new_tensor; + + VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0)); + for (int i = 0; i < new_tensor.dimension(0); ++i) { + VERIFY_IS_EQUAL(tensor1(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor2(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor3(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor4(i), 
new_tensor(i)); + } +} + void test_cxx11_tensor_assign() { @@ -235,4 +259,6 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_same_type()); + CALL_SUBTEST(test_auto_resize()); + } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index fc67d500b..a37fcd967 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -141,6 +141,66 @@ static void test_multidims() } +static void test_holes() { + Tensor t1(2, 5, 7, 3); + Tensor t2(2, 7, 11, 13, 3); + t1.setRandom(); + t2.setRandom(); + + Eigen::array dims({{DimPair(0, 0), DimPair(3, 4)}}); + Tensor result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 7); + VERIFY_IS_EQUAL(result.dimension(3), 11); + VERIFY_IS_EQUAL(result.dimension(4), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 5; ++l) { + for (int m = 0; m < 5; ++m) { + VERIFY_IS_APPROX(result(i, j, k, l, m), + t1(0, i, j, 0) * t2(0, k, l, m, 0) + + t1(1, i, j, 0) * t2(1, k, l, m, 0) + + t1(0, i, j, 1) * t2(0, k, l, m, 1) + + t1(1, i, j, 1) * t2(1, k, l, m, 1) + + t1(0, i, j, 2) * t2(0, k, l, m, 2) + + t1(1, i, j, 2) * t2(1, k, l, m, 2)); + } + } + } + } + } +} + + +static void test_full_redux() +{ + Tensor t1(2, 2); + Tensor t2(2, 2, 2); + t1.setRandom(); + t2.setRandom(); + + Eigen::array dims({{DimPair(0, 0), DimPair(1, 1)}}); + Tensor result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) + + t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) + t1(1, 0) * t2(1, 0, 1) + + t1(0, 1) * t2(0, 1, 1) + t1(1, 1) * t2(1, 1, 1)); + + dims[0] = DimPair(1, 0); + dims[1] = DimPair(2, 1); + result = t2.contract(t1, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(0, 1, 0) + + t1(0, 1) * t2(0, 0, 1) + t1(1, 1) * t2(0, 1, 1)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) + t1(1, 0) * t2(1, 1, 0) + + t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1)); +} + + static void test_expr() { Tensor mat1(2, 3); @@ -160,10 +220,116 @@ static void test_expr() } +static void test_out_of_order_contraction() +{ + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2, 2); + + Eigen::array dims({{DimPair(2, 0), DimPair(0, 2)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + + Eigen::array dims2({{DimPair(0, 2), DimPair(2, 0)}}); + mat3 = mat1.contract(mat2, dims2); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + 
mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + +} + + +static void test_consistency() +{ + // this does something like testing (A*B)^T = (B^T * A^T) + + Tensor mat1(4, 3, 5); + Tensor mat2(3, 2, 1, 5, 4); + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(5, 2, 1, 5); + Tensor mat4(2, 1, 5, 5); + + // contract on dimensions of size 4 and 3 + Eigen::array dims1({{DimPair(0, 4), DimPair(1, 0)}}); + Eigen::array dims2({{DimPair(4, 0), DimPair(0, 1)}}); + + mat3 = mat1.contract(mat2, dims1); + mat4 = mat2.contract(mat1, dims2); + + // check that these are equal except for ordering of dimensions + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + } + } +} + + +static void test_large_contraction() +{ + Tensor t_left(30, 50, 8, 31); + Tensor t_right(8, 31, 7, 20, 10); + Tensor t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + MatrixXf m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_scalar()); CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_holes()); + CALL_SUBTEST(test_full_redux()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_out_of_order_contraction()); + CALL_SUBTEST(test_consistency()); + CALL_SUBTEST(test_large_contraction()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index caf2e9735..f331cb481 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -22,17 +22,43 @@ using Eigen::RowMajor; // Context for evaluation on cpu struct CPUContext { - CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out) { } + CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(Eigen::array(2,2)), kernel_3d_(Eigen::array(2,2,2)) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(Eigen::array(0,0)) = 3.14f; + kernel_2d_(Eigen::array(1,0)) = 2.7f; + kernel_2d_(Eigen::array(0,1)) = 0.2f; + kernel_2d_(Eigen::array(1,1)) = 7.0f; + + kernel_3d_(Eigen::array(0,0,0)) = 3.14f; + kernel_3d_(Eigen::array(0,1,0)) = 2.7f; + kernel_3d_(Eigen::array(0,0,1)) = 0.2f; + kernel_3d_(Eigen::array(0,1,1)) = 7.0f; + kernel_3d_(Eigen::array(1,0,0)) = -1.0f; + kernel_3d_(Eigen::array(1,1,0)) = -0.3f; + kernel_3d_(Eigen::array(1,0,1)) = -0.7f; + kernel_3d_(Eigen::array(1,1,1)) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } const Eigen::Tensor& in1() const { return in1_; } const Eigen::Tensor& in2() const { 
return in2_; } - Eigen::TensorDevice, Eigen::DefaultDevice> out() { return TensorDevice, Eigen::DefaultDevice>(cpu_device_, out_); } + Eigen::Tensor& out() { return out_; } + const Eigen::Tensor& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor& kernel3d() const { return kernel_3d_; } private: const Eigen::Tensor& in1_; const Eigen::Tensor& in2_; Eigen::Tensor& out_; + Eigen::Tensor kernel_1d_; + Eigen::Tensor kernel_2d_; + Eigen::Tensor kernel_3d_; + Eigen::DefaultDevice cpu_device_; }; @@ -40,19 +66,45 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - cudaStreamCreate(&stream_); + assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaStreamCreate(&stream_) == cudaSuccess); } ~GPUContext() { - cudaStreamDestroy(stream_); + assert(cudaFree(kernel_1d_) == cudaSuccess); + assert(cudaFree(kernel_2d_) == cudaSuccess); + assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(cudaStreamDestroy(stream_) == cudaSuccess); } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + const Eigen::TensorMap >& in1() const { return in1_; } const Eigen::TensorMap >& in2() const { return in2_; } - Eigen::TensorDevice >, Eigen::GpuDevice> out() { return TensorDevice >, Eigen::GpuDevice>(gpu_device_, out_); } + Eigen::TensorMap >& out() { return out_; } + Eigen::TensorMap > kernel1d() const { return Eigen::TensorMap >(kernel_1d_, 2); } + Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, Eigen::array(2, 2)); } + Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, Eigen::array(2, 2, 2)); } private: const Eigen::TensorMap >& in1_; const Eigen::TensorMap >& in2_; Eigen::TensorMap >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + cudaStream_t stream_; Eigen::GpuDevice gpu_device_; }; @@ -62,49 +114,151 @@ struct GPUContext { template static void test_contextual_eval(Context* context) { - context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } template static void test_forced_contextual_eval(Context* context) { - context->out() = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + +template +static void test_contraction(Context* context) +{ + Eigen::array, 2> dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); + + Eigen::array shape(40, 50*70); + + Eigen::DSizes 
indices(0,0); + Eigen::DSizes sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template +static void test_1d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(40,49,70)); + + Eigen::array dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template +static void test_2d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(40,49,69)); + + Eigen::array dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template +static void test_3d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(39,49,69)); + + Eigen::array dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); } + static void test_cpu() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); + Eigen::Tensor in1(Eigen::array(40,50,70)); + Eigen::Tensor in2(Eigen::array(40,50,70)); + Eigen::Tensor out(Eigen::array(40,50,70)); - in1.setRandom(); - in2.setRandom(); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); CPUContext context(in1, in2, out); test_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); } } } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(Eigen::array(i,j,0)); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f) + + (in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const 
float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f) + + (in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + + in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } } static void test_gpu() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); - in1.setRandom(); - in2.setRandom(); + Eigen::Tensor in1(Eigen::array(40,50,70)); + Eigen::Tensor in2(Eigen::array(40,50,70)); + Eigen::Tensor out(Eigen::array(40,50,70)); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); std::size_t in1_bytes = in1.size() * sizeof(float); std::size_t in2_bytes = in2.size() * sizeof(float); @@ -120,32 +274,87 @@ static void test_gpu() { cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(2,3,7)); - Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(2,3,7)); - Eigen::TensorMap > gpu_out(d_out, Eigen::array(2,3,7)); + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(40,50,70)); + Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(40,50,70)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(40,50,70)); GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); } } } -} + test_contraction(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(Eigen::array(i,j,0)); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), 
d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f + + in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + + in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + VERIFY_IS_APPROX(expected, result); + } + } + } +} void test_cxx11_tensor_device() diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 92dd01a52..5ab8b6821 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -106,11 +106,58 @@ static void test_expr_shuffling() } } } + + dst_slice_start[0] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[0] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } } +static void test_shuffling_as_value() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array shuffles; + shuffles[2] = 0; + shuffles[3] = 1; + shuffles[1] = 2; + shuffles[0] = 3; + Tensor shuffle(5,7,3,2); + shuffle.shuffle(shuffles) = tensor; + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + void test_cxx11_tensor_shuffling() { CALL_SUBTEST(test_simple_shuffling()); CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_shuffling_as_value()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 1455f2a4c..a70591c82 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -257,12 +257,38 @@ static void test_simple_assign() VERIFY_IS_EQUAL((e2(1,0,2)), -1); } +static void test_resize() +{ + Tensor epsilon; + epsilon.resize(2,3,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 2); + VERIFY_IS_EQUAL(epsilon.dimension(1), 3); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + + const int* old_data = 
epsilon.data(); + epsilon.resize(3,2,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 2); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + VERIFY_IS_EQUAL(epsilon.data(), old_data); + + epsilon.resize(3,5,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 5); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3ul*5*7); + VERIFY_IS_NOT_EQUAL(epsilon.data(), old_data); +} + void test_cxx11_tensor_simple() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_resize()); } /* -- cgit v1.2.3 From 74db22455ae0172faaae91321da0b303bb82369d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 5 Sep 2014 07:47:43 -0700 Subject: Misc fixes. --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++---- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/test/cxx11_tensor_padding.cpp | 38 ++++++++++++++++++++-- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d9a6b3f1b..28ae7b3c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -163,7 +163,7 @@ template { return this->m_impl.coeffRef(index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); @@ -314,7 +314,7 @@ struct TensorEvaluator, Devi Scalar* src = m_impl.data(); for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy(data+i, src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); } return false; } @@ -334,7 +334,7 @@ struct TensorEvaluator, Devi template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -355,7 +355,7 @@ struct TensorEvaluator, Devi return rslt; } else { - CoeffReturnType values[packetSize]; + typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); for (int i = 1; i < packetSize-1; ++i) { @@ -420,10 +420,10 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 4482c0992..7da89458f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class 
TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType> >
+class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index cb010f512..6f74216dd 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -37,9 +37,42 @@ static void test_simple_padding()
 for (int k = 0; k < 12; ++k) {
 for (int l = 0; l < 7; ++l) {
 if (j >= 2 && j < 5 && k >= 3 && k < 8) {
- VERIFY_IS_EQUAL(tensor(i,j-2,k-3,l), padded(i,j,k,l));
+ VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
 } else {
- VERIFY_IS_EQUAL(0.0f, padded(i,j,k,l));
+ VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+ }
+ }
+ }
+ }
+}
+
+static void test_padded_expr()
+{
+ Tensor<float, 4> tensor(2,3,5,7);
+ tensor.setRandom();
+
+ array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+ paddings[0] = std::make_pair(0, 0);
+ paddings[1] = std::make_pair(2, 1);
+ paddings[2] = std::make_pair(3, 4);
+ paddings[3] = std::make_pair(0, 0);
+
+ Eigen::DSizes<ptrdiff_t, 2> reshape_dims;
+ reshape_dims[0] = 12;
+ reshape_dims[1] = 84;
+
+ Tensor<float, 2> result;
+ result = tensor.pad(paddings).reshape(reshape_dims);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ for (int k = 0; k < 12; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+ VERIFY_IS_EQUAL(result(i+2*j,k+12*l), tensor(i,j-2,k-3,l));
+ } else {
+ VERIFY_IS_EQUAL(result(i+2*j,k+12*l), 0.0f);
 }
 }
 }
@@ -51,4 +84,5 @@ void test_cxx11_tensor_padding()
 {
 CALL_SUBTEST(test_simple_padding());
+ CALL_SUBTEST(test_padded_expr());
 }
-- 
cgit v1.2.3


From efdff157493826bbcc023a85e08596fd58d7997a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Sat, 6 Sep 2014 13:28:24 -0700
Subject: Fixed a typo in the contraction code

---
 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 897d73806..46624724c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) {
+ if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) {
 m_dimensions[0] = 1;
 }
 }
-- 
cgit v1.2.3


From 1c236f4c9ae78cc58156eebe3b2bb43588897af4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 1 Oct 2014 20:21:42 -0700
Subject: Added tests for tensors of const values and tensors of strings

---
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +-
 unsupported/test/CMakeLists.txt | 2 +
 unsupported/test/cxx11_tensor_of_const_values.cpp | 105 +++++++++++++++
 unsupported/test/cxx11_tensor_of_strings.cpp | 142 +++++++++++++++++++++
 4 files changed, 250 insertions(+), 1 deletion(-)
 create mode 100644 unsupported/test/cxx11_tensor_of_const_values.cpp
 create mode 100644 unsupported/test/cxx11_tensor_of_strings.cpp

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 28ae7b3c6..13109f514 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -301,7 +301,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
 {
 m_impl.evalSubExprsIfNeeded(NULL);
- if (data && m_impl.data()) {
+ if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) {
 Index contiguous_values = 1;
 for (int i = 0; i < NumDims; ++i) {
 contiguous_values *= dimensions()[i];
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 615ff3e6d..8d4e7db66 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -106,6 +106,8 @@ if(EIGEN_TEST_CXX11)
 ei_add_test(cxx11_tensor_convolution "-std=c++0x")
 ei_add_test(cxx11_tensor_expr "-std=c++0x")
 # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
 ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
 ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
 ei_add_test(cxx11_tensor_map "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
new file mode 100644
index 000000000..f179a0c21
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_assign()
+{
+ float data1[6];
+ TensorMap<Tensor<const float, 2> > mat1(data1, 2, 3);
+ float data2[6];
+ const TensorMap<Tensor<float, 2> > mat2(data2, 2, 3);
+
+ for (int i = 0; i < 6; ++i) {
+ data1[i] = i;
+ data2[i] = -i;
+ }
+
+ Tensor<float, 2> rslt1;
+ rslt1 = mat1;
+ Tensor<float, 2> rslt2;
+ rslt2 = mat2;
+
+ Tensor<float, 2> rslt3 = mat1;
+ Tensor<float, 2> rslt4 = mat2;
+
+ Tensor<float, 2> rslt5(mat1);
+ Tensor<float, 2> rslt6(mat2);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j));
+ VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j));
+ VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j));
+ VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j));
+ VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j));
+ VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j));
+ }
+ }
+}
+
+
+static void test_plus()
+{
+ float data1[6];
+ TensorMap<Tensor<const float, 2> > mat1(data1, 2, 3);
+ float data2[6];
+ TensorMap<Tensor<const float, 2> > mat2(data2, 2, 3);
+
+ for (int i = 0; i < 6; ++i) {
+ data1[i] = i;
+ data2[i] = -i;
+ }
+
+ Tensor<float, 2> sum1;
+ sum1 = mat1 + mat2;
+ Tensor<float, 2> sum2;
+ sum2 = mat2 + mat1;
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_APPROX(sum1(i,j), 0.0f);
+ VERIFY_IS_APPROX(sum2(i,j), 0.0f);
+ }
+ }
+}
+
+
+static void test_plus_equal()
+{
+ float data1[6];
+ TensorMap<Tensor<const float, 2> > mat1(data1, 2, 3);
+ float data2[6];
+ TensorMap<Tensor<float, 2> > mat2(data2, 2, 3);
+
+ for (int i = 0; i < 6; ++i) {
+ data1[i] = i;
+ data2[i] = -i;
+ }
+ mat2 += mat1;
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_APPROX(mat2(i,j), 0.0f);
+ }
+ }
+}
+
+
+void test_cxx11_tensor_of_const_values()
+{
+ CALL_SUBTEST(test_assign());
+ CALL_SUBTEST(test_plus());
+ CALL_SUBTEST(test_plus_equal());
+}
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
new file mode 100644
index 000000000..0ffa341c4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <string>
+#include <Eigen/CXX11/Tensor>
+
+using std::string;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_assign()
+{
+ string data1[6];
+ TensorMap<Tensor<string, 2> > mat1(data1, 2, 3);
+ string data2[6];
+ const TensorMap<Tensor<const string, 2> > mat2(data2, 2, 3);
+
+ for (int i = 0; i < 6; ++i) {
+ std::ostringstream s1;
+ s1 << "abc" << i*3;
+ data1[i] = s1.str();
+ std::ostringstream s2;
+ s2 << "def" << i*5;
+ data2[i] = s2.str();
+ }
+
+ Tensor<string, 2> rslt1;
+ rslt1 = mat1;
+ Tensor<string, 2> rslt2;
+ rslt2 = mat2;
+
+ Tensor<string, 2> rslt3 = mat1;
+ Tensor<string, 2> rslt4 = mat2;
+
+ Tensor<string, 2> rslt5(mat1);
+ Tensor<string, 2> rslt6(mat2);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]);
+ VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]);
+ VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]);
+ VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]);
+ VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]);
+ VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]);
+ }
+ }
+}
+
+
+static void test_concat()
+{
+ Tensor<string, 2> t1(2, 3);
+ Tensor<string, 2> t2(2, 3);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ std::ostringstream s1;
+ s1 << "abc" << i + j*2;
+ t1(i, j) = s1.str();
+ std::ostringstream s2;
+ s2 << "def" << i*5 + j*32;
+ t2(i, j) = s2.str();
+ }
+ }
+
+ Tensor<string, 2> result = t1.concatenate(t2, 1);
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_EQUAL(result.dimension(1), 6);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(result(i, j), t1(i, j));
+ VERIFY_IS_EQUAL(result(i, j+3), t2(i, j));
+ }
+ }
+}
+
+
+static void test_slices()
+{
+ Tensor<string, 2> data(2, 6);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ std::ostringstream s1;
+ s1 << "abc" << i + j*2;
+ data(i, j) = s1.str();
+ }
+ }
+
+ const Eigen::DSizes<ptrdiff_t, 2> half_size{{2, 3}};
+ const Eigen::DSizes<ptrdiff_t, 2> first_half{{0, 0}};
+ const Eigen::DSizes<ptrdiff_t, 2> second_half{{0, 3}};
+
+ Tensor<string, 2> t1 = data.slice(first_half, half_size);
+ Tensor<string, 2> t2 = data.slice(second_half, half_size);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(data(i, j), t1(i, j));
+ VERIFY_IS_EQUAL(data(i, j+3), t2(i, j));
+ }
+ }
+}
+
+
+static void test_additions()
+{
+ Tensor<string, 1> data1(3);
+ Tensor<string, 1> data2(3);
+ for (int i = 0; i < 3; ++i) {
+ data1(i) = "abc";
+ std::ostringstream s1;
+ s1 << i;
+ data2(i) = s1.str();
+ }
+
+ Tensor<string, 1> sum = data1 + data2;
+ for (int i = 0; i < 3; ++i) {
+ std::ostringstream concat;
+ concat << "abc" << i;
+ string expected = concat.str();
+ VERIFY_IS_EQUAL(sum(i), expected);
+ }
+}
+
+
+void test_cxx11_tensor_of_strings()
+{
+ // Beware: none of this is likely to ever work on a GPU.
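+ // (std::string is not a POD type and allocates on the host heap, so
+ // these tensors can only ever be evaluated on the CPU.)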
+ CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_concat()); + CALL_SUBTEST(test_slices()); + CALL_SUBTEST(test_additions()); +} -- cgit v1.2.3 From 7caaf6453b7b1f58d953729380d596b2d9b27835 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Oct 2014 20:38:22 -0700 Subject: Added support for tensor reductions and concatenations --- unsupported/Eigen/CXX11/Tensor | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 28 +++ .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 217 ++++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 62 ++++++ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 226 +++++++++++++++++++++ unsupported/test/CMakeLists.txt | 4 +- unsupported/test/cxx11_tensor_concatenation.cpp | 110 ++++++++++ unsupported/test/cxx11_tensor_reduction.cpp | 147 ++++++++++++++ 9 files changed, 798 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h create mode 100644 unsupported/test/cxx11_tensor_concatenation.cpp create mode 100644 unsupported/test/cxx11_tensor_reduction.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index ebe6419e8..11161a547 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -34,12 +34,15 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 2da8f8cc8..2f7c9ecda 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,12 +204,40 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + // Reductions. 
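+ // Note: each of these returns a lazily evaluated TensorReductionOp
+ // expression; the reduction itself only runs when the expression is
+ // assigned to a tensor or otherwise evaluated.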
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp broadcast(const Broadcast& broadcast) const { return TensorBroadcastingOp(derived(), broadcast); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + // Morphing operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 000000000..b8e43f484 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,217 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
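+ // promote_storage_type only has specializations for compatible types,
+ // so concatenating tensors of unrelated scalar types fails to compile
+ // instead of producing garbage at run time.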
+ typedef typename promote_storage_type::ret Scalar; + typedef typename packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Packet Packet; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int RightNumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. 
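+ // The operands may disagree only along the concatenation axis, where
+ // the output extent is the sum of the two input extents.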
+ eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array subs; + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. 
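+ // Until then, packet() gathers packetSize coefficients one at a time
+ // through coeff() and reloads them from an aligned stack buffer.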
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + protected: + const Axis m_axis; + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index afbcc9486..bc67586a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,8 +21,9 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorReductionOp; template class TensorBroadcastingOp; +template class TensorReductionOp; +template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 000000000..92984336c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +// Standard reduction functors +template struct SumReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SumReducer() : m_sum(0) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + m_sum += t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_sum; + } + + private: + T m_sum; +}; + +template struct MaxReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max((std::numeric_limits::min)()) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + if (t > m_max) { m_max = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_max; + } + + private: + T m_max; +}; + +template struct MinReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MinReducer() : m_min((std::numeric_limits::max)()) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + if (t < m_min) { m_min = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_min; + } + + private: + T m_min; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 000000000..eef992106 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. 
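+ * Applies the reduction functor Op (such as internal::SumReducer) across
+ * the dimensions listed in Dims and returns a tensor of the remaining
+ * dimensions; a full reduction produces a tensor with a single coefficient.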
+ * + */ + +namespace internal { +template +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReductionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReductionOp type; +}; + +} // end namespace internal + + +template +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumDims = (NumInputDims==NumReducedDims) ? 
1 : NumInputDims - NumReducedDims; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = false, // The code isn't vectorized properly yet + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()) + { + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + + array reduced; + for (int i = 0; i < NumInputDims; ++i) { + reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedDims[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + m_dimensions[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + + array strides; + strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + strides[i] = strides[i-1] * input_dims[i-1]; + } + outputIndex = 0; + reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedStrides[reduceIndex] = strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = strides[i]; + ++outputIndex; + } + } + + // Special case for full reductions + if (NumInputDims == NumReducedDims) { + m_dimensions[0] = 1; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Op reducer(m_reducer); + reduce(firstInput(index), 0, reducer); + return reducer.finalize(); + } + + // TODO(bsteiner): provide a more efficient implementation. 
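+ // Each packet lane currently runs a full scalar reduction, which is why
+ // PacketAccess is set to false in the evaluator enum above.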
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + return startInput; + } + + EIGEN_DEVICE_FUNC void reduce(Index firstIndex, int DimIndex, Op& reducer) const { + for (int j = 0; j < m_reducedDims[DimIndex]; ++j) { + const Index input = firstIndex + j * m_reducedStrides[DimIndex]; + if (DimIndex < NumReducedDims-1) { + reduce(input, DimIndex+1, reducer); + } else { + reducer.reduce(m_impl.coeff(input)); + } + } + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_preservedStrides; + array m_reducedStrides; + array m_reducedDims; + Op m_reducer; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 8d4e7db66..e83d8b54e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,14 +106,16 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") +# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") + ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_reduction "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp new file mode 100644 index 000000000..8fd4f5f80 --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation.cpp @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_dimension_failures() +{ + Tensor left(2, 3, 1); + Tensor right(3, 3, 1); + left.setRandom(); + right.setRandom(); + + // Okay; other dimensions are equal. + Tensor concatenation = left.concatenate(right, 0); + + // Dimension mismatches. 
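+ // left is 2x3x1 while right is 3x3x1: concatenating along axis 1 or 2
+ // leaves the mismatched first dimension, so the evaluator must assert.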
+ VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2)); + + // Axis > NumDims or < 0. + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1)); +} + +static void test_static_dimension_failure() +{ + Tensor left(2, 3); + Tensor right(2, 3, 1); + +#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE + // Technically compatible, but we static assert that the inputs have same + // NumDims. + Tensor concatenation = left.concatenate(right, 0); +#endif + + // This can be worked around in this case. + Tensor concatenation = left + .reshape(Tensor::Dimensions{{2, 3, 1}}) + .concatenate(right, 0); + Tensor alternative = left + .concatenate(right.reshape(Tensor::Dimensions{{2, 3}}), 0); +} + +static void test_simple_concatenation() +{ + Tensor left(2, 3, 1); + Tensor right(2, 3, 1); + left.setRandom(); + right.setRandom(); + + Tensor concatenation = left.concatenate(right, 0); + VERIFY_IS_EQUAL(concatenation.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0)); + } + } + + concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0)); + } + } + + concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0)); + } + } +} + + +// TODO(phli): Add test once we have a real vectorized implementation. +// static void test_vectorized_concatenation() {} + + +void test_cxx11_tensor_concatenation() +{ + CALL_SUBTEST(test_dimension_failures()); + CALL_SUBTEST(test_static_dimension_failure()); + CALL_SUBTEST(test_simple_concatenation()); + // CALL_SUBTEST(test_vectorized_concatenation()); +} diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp new file mode 100644 index 000000000..27135b982 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" +#include +#include + +using Eigen::Tensor; + +static void test_simple_reductions() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), sum); + } + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float max_val = std::numeric_limits::lowest(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + max_val = (std::max)(max_val, tensor(k, i, l, j)); + } + } + VERIFY_IS_APPROX(result(i, j), max_val); + } + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float min_val = (std::numeric_limits::max)(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + min_val = (std::min)(min_val, tensor(k, l, i, j)); + } + } + VERIFY_IS_APPROX(result(i, j), min_val); + } + } +} + + +static void test_full_reductions() +{ + Tensor tensor(2,3); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + + Tensor result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sum); + + result = tensor.square().sum(reduction_axis).sqrt(); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j) * tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sqrtf(sum)); +} + + +struct UserReducer { + UserReducer(float offset) : offset_(offset), sum_(0.0f) {} + void reduce(const float val) { + sum_ += val * val; + } + float finalize() const { + return 1.0f / (sum_ + offset_); + } + + private: + float offset_; + float sum_; +}; + +static void test_user_defined_reductions() +{ + Tensor tensor(5,7); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + + UserReducer reducer(10.0f); + Tensor result = tensor.reduce(reduction_axis, reducer); + VERIFY_IS_EQUAL(result.dimension(0), 5); + for (int i = 0; i < 5; ++i) { + float expected = 10.0f; + for (int j = 0; j < 7; ++j) { + expected += tensor(i, j) * tensor(i, j); + } + expected = 1.0f / expected; + VERIFY_IS_APPROX(result(i), expected); + } +} + + +void test_cxx11_tensor_reduction() +{ + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); +} -- cgit v1.2.3 From 5cc23199be743d0d1be85d709eb366e67e87a262 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:30:44 -0700 Subject: More tests to validate the const-correctness of the tensor code. 
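The intended contract, as a minimal sketch (the actual test added below
in cxx11_tensor_const.cpp maps a 3d tensor of floats the same way):

    float data[6] = {0, 1, 2, 3, 4, 5};
    Eigen::TensorMap<Eigen::Tensor<const float, 2> > mat(data, 2, 3);
    Eigen::Tensor<float, 2> copy(2, 3);
    copy = mat;  // reading through the const map compiles;
                 // writing through it should not.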
--- Eigen/src/Core/GenericPacketMath.h | 2 ++ Eigen/src/Core/util/XprHelper.h | 8 +++++++ unsupported/test/CMakeLists.txt | 3 ++- unsupported/test/cxx11_tensor_const.cpp | 39 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_const.cpp diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ec29d0fd..e6fea5bba 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -95,6 +95,8 @@ template struct packet_traits : default_packet_traits }; }; +template struct packet_traits : packet_traits { }; + /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 7c77b2263..67ca49754 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -415,6 +415,14 @@ template struct promote_storage_type { typedef A ret; }; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83d8b54e..a47c7bc74 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,7 +106,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") -# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_const "-std=c++0x") + ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant(random.data(), 2, 3, 7); + Tensor result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} -- cgit v1.2.3 From 8b2afe33a165ff0cc5a7afd14fcfb06cdf703235 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:39:36 -0700 Subject: Fixes for the forced evaluation of tensor expressions More tests --- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 13 +++--- unsupported/test/CMakeLists.txt | 3 +- unsupported/test/cxx11_tensor_dimension.cpp | 51 ++++++++++++++++++++++ unsupported/test/cxx11_tensor_forced_eval.cpp | 51 ++++++++++++++++++++++ 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_dimension.cpp create mode 100644 unsupported/test/cxx11_tensor_forced_eval.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 6f6641de6..cb14cc7f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -87,31 +87,28 @@ struct TensorEvaluator, Device> enum { IsAligned = true, - PacketAccess = true, + PacketAccess = (internal::packet_traits::size > 1), }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - EIGEN_DEVICE_FUNC ~TensorEvaluator() { - eigen_assert(!m_buffer); - } - typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); m_impl.cleanup(); + return true; } EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate(m_buffer); @@ -129,6 +126,8 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + Scalar* data() const { return m_buffer; } + private: TensorEvaluator m_impl; const ArgType m_op; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index a47c7bc74..5d8913dd8 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -101,10 +101,12 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") # ei_add_test(cxx11_tensor_assign "-std=c++0x") +# ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") + ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") 
ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") @@ -120,6 +122,5 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp new file mode 100644 index 000000000..fc0d29c50 --- /dev/null +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_dynamic_size() +{ + Eigen::DSizes dimensions(Eigen::array(2,3,7)); + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + +static void test_fixed_size() +{ + Eigen::Sizes<2,3,7> dimensions; + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + + +static void test_match() +{ + Eigen::DSizes dyn(Eigen::array(2,3,7)); + Eigen::Sizes<2,3,7> stat; + VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true); +} + + +void test_cxx11_tensor_dimension() +{ + CALL_SUBTEST(test_dynamic_size()); + CALL_SUBTEST(test_fixed_size()); + CALL_SUBTEST(test_match()); +} diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp new file mode 100644 index 000000000..529584a7b --- /dev/null +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +#include + +using Eigen::MatrixXf; +using Eigen::Tensor; + +static void test_simple() +{ + MatrixXf m1(3,3); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + TensorMap> mat1(m1.data(), 3,3); + TensorMap> mat2(m2.data(), 3,3); + + Tensor mat3(3,3); + mat3 = mat1; + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 0)}}); + + mat3 = mat3.contract(mat2, dims).eval(); + + VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0)); + VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1)); + VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2)); + VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0)); + VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1)); + VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2)); + VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0)); + VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1)); + VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2)); +} + + +void test_cxx11_tensor_forced_eval() +{ + CALL_SUBTEST(test_simple()); +} -- cgit v1.2.3 From b7271dffb5b1ceeee4c8bd99402ff89dcce58d74 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 16:51:57 -0700 Subject: Generalized the gebp apis --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 423 +++++++++++---------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 80 ++-- .../Core/products/GeneralMatrixMatrixTriangular.h | 54 +-- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 51 ++- Eigen/src/Core/products/TriangularMatrixMatrix.h | 65 ++-- Eigen/src/Core/products/TriangularSolverMatrix.h | 49 ++- Eigen/src/Core/util/BlasUtil.h | 106 +++++- unsupported/test/CMakeLists.txt | 2 +- 8 files changed, 473 insertions(+), 357 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7da52c2e8..090c8f4e6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -667,7 +667,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template +template struct gebp_kernel { typedef gebp_traits Traits; @@ -676,14 +676,15 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - + typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - + + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, @@ -693,14 +694,16 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template +template EIGEN_DONT_INLINE -void gebp_kernel - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, +void gebp_kernel + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, 
ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -743,15 +746,15 @@ void gebp_kernel traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -813,48 +816,48 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r1+0*Traits::ResPacketSize); - R1 = ploadu(r1+1*Traits::ResPacketSize); - R2 = ploadu(r1+2*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); - pstoreu(r1+0*Traits::ResPacketSize, R0); - pstoreu(r1+1*Traits::ResPacketSize, R1); - pstoreu(r1+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r2+2*Traits::ResPacketSize); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r2+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r3+0*Traits::ResPacketSize); - R1 = ploadu(r3+1*Traits::ResPacketSize); - R2 = ploadu(r3+2*Traits::ResPacketSize); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); - pstoreu(r3+0*Traits::ResPacketSize, R0); - pstoreu(r3+1*Traits::ResPacketSize, R1); - pstoreu(r3+2*Traits::ResPacketSize, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, 
R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C4); traits.initAcc(C8); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -912,19 +916,19 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - traits.acc(C8 , alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); } } } - + //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { @@ -946,15 +950,15 @@ void gebp_kernel traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -978,7 +982,7 @@ void gebp_kernel traits.madd(A1, B2, C6, B2); \ traits.madd(A0, B3, C3, T0); \ traits.madd(A1, B3, C7, B3) - + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1002,37 +1006,37 @@ void gebp_kernel blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1, R2, R3; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r1+0*Traits::ResPacketSize); - R3 = ploadu(r1+1*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r1+0*Traits::ResPacketSize, R2); - pstoreu(r1+1*Traits::ResPacketSize, R3); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r3+0*Traits::ResPacketSize); - R3 = ploadu(r3+1*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + 
r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r3+0*Traits::ResPacketSize, R2); - pstoreu(r3+1*Traits::ResPacketSize, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C0); traits.initAcc(C4); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - internal::prefetch(r0+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1089,12 +1093,12 @@ void gebp_kernel ResPacket R0, R1; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); } } } @@ -1120,15 +1124,15 @@ void gebp_kernel traits.initAcc(C2); traits.initAcc(C3); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1171,25 +1175,25 @@ void gebp_kernel blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r1+0*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r1+0*Traits::ResPacketSize, R1); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r3+0*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r3+0*Traits::ResPacketSize, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * 
Traits::ResPacketSize, R1);
        }
-
+
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2
          AccPacket C0;
          traits.initAcc(C0);
-         ResScalar* r0 = &res[(j2+0)*resStride + i];
+         LinearMapper r0 = res.getLinearMapper(i, j2);
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1241,9 +1245,9 @@ void gebp_kernel
          #undef EIGEN_GEBGP_ONESTEP
          ResPacket R0;
          ResPacket alphav = pset1(alpha);
-         R0 = ploadu(r0+0*Traits::ResPacketSize);
+         R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
-         pstoreu(r0+0*Traits::ResPacketSize, R0);
+         r0.storePacket(0 * Traits::ResPacketSize, R0);
        }
      }
    }
@@ -1259,7 +1263,7 @@ void gebp_kernel
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-
+
        if( (SwappedTraits::LhsProgress % 4)==0 )
        {
          // NOTE The following piece of code won't work for 512 bit registers
@@ -1268,32 +1272,32 @@ void gebp_kernel
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);
-
+
          const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);
-
+
          Index k=0;
          for(; k
          {
            SLhsPacket A0;
            SRhsPacket B_0;
-
+
            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0);
-
+
            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
@@ -1317,10 +1321,10 @@ void gebp_kernel
          typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf;
          typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf;
          typedef typename conditional::half,SAccPacket>::type SAccPacketHalf;
-
-         SResPacketHalf R = pgather(&res[j2*resStride + i], resStride);
+
+         SResPacketHalf R = res.template gatherPacket(i, j2);
          SResPacketHalf alphav = pset1(alpha);
-
+
          if(depth-endk>0)
          {
            // We have to handle the last row of the rhs which corresponds to a half-packet
@@ -1336,14 +1340,14 @@ void gebp_kernel
          {
            straits.acc(predux4(C0), alphav, R);
          }
-         pscatter(&res[j2*resStride + i], R, resStride);
+         res.scatterPacket(i, j2, R);
        }
        else
        {
-         SResPacket R = pgather(&res[j2*resStride + i], resStride);
+         SResPacket R = res.template gatherPacket(i, j2);
          SResPacket alphav = pset1(alpha);
          straits.acc(C0, alphav, R);
-         pscatter(&res[j2*resStride + i], R, resStride);
+         res.scatterPacket(i, j2, R);
        }
      }
      else // scalar path
@@ -1355,25 +1359,25 @@ void gebp_kernel
        {
          LhsScalar A0;
          RhsScalar B_0, B_1;
-
+
          A0 = blA[k];
-
+
          B_0 = blB[0];
          B_1 = blB[1];
          MADD(cj,A0,B_0,C0, B_0);
          MADD(cj,A0,B_1,C1, B_1);
-
+
          B_0 = blB[2];
          B_1 = blB[3];
          MADD(cj,A0,B_0,C2, B_0);
          MADD(cj,A0,B_1,C3, B_1);
-
+
          blB += 4;
        }
-       res[(j2+0)*resStride + i] += alpha*C0;
-       res[(j2+1)*resStride + i] += alpha*C1;
-       res[(j2+2)*resStride + i] += alpha*C2;
-       res[(j2+3)*resStride + i] += alpha*C3;
+       res(i, j2 + 0) += alpha * C0;
+       res(i, j2 + 1) += alpha * C1;
+       res(i, j2 + 2) += alpha * C2;
+       res(i, j2 + 3) += alpha * C3;
      }
    }
  }
@@ -1394,7 +1398,7 @@ void gebp_kernel
        RhsScalar B_0 = blB[k];
        MADD(cj, A0, B_0, C0, B_0);
      }
-     res[(j2+0)*resStride + i] += alpha*C0;
+     res(i, j2) += alpha * C0;
    }
  }
}
@@ -1417,15 +1421,16 @@ void gebp_kernel
//
//  32 33 34 35 ...
//  36 37 38 39 ...
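// A scalar sketch (hypothetical and simplified) of the panel layout the
// comment above describes: gemm_pack_lhs copies an mr x depth panel of the
// column-major lhs into one contiguous buffer so the kernel can stream it
// linearly, which is what produces the 0,1,2,3 / 4,5,6,7 numbering.
#include <vector>

void sketch_pack_lhs(std::vector<float>& blockA, const float* lhs,
                     long lhsStride, long depth, long rows, long mr) {
  long count = 0;
  for (long i = 0; i < rows; i += mr) {                 // one panel at a time
    const long panel = (i + mr <= rows) ? mr : rows - i;
    for (long k = 0; k < depth; ++k)                    // column by column
      for (long w = 0; w < panel; ++w)                  // mr rows per column
        blockA[count++] = lhs[(i + w) + k * lhsStride];
  }
}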
-template -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1436,30 +1441,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0; - + Index i=0; - + // Pack 3 packets if(Pack1>=3*PacketSize) { for(; i(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); - C = ploadu(&lhs(i+2*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1473,12 +1477,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1491,11 +1495,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1508,11 +1512,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1543,13 +1548,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 
0; - + // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - + int pack = Pack1; Index i = 0; while(pack>0) @@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1594,15 +1598,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1685,27 +1690,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) { for(Index j2=packet_cols8; j2 kernel; - kernel.packet[0] = ploadu(&b0[k]); - kernel.packet[1] = ploadu(&b1[k]); - kernel.packet[2] = ploadu(&b2[k]); - kernel.packet[3] = ploadu(&b3[k]); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); @@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1762,7 +1768,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - + // if(nr>=8) // { // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); count += 4; } } @@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs::ReturnType ResScal static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth, // In order to reduce the chance that a thread has to wait for the other, // let's start by packing B'. - pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); // Pack A_k to A' in a parallel fashion: // each thread packs the sub block A_k,i to A'_i where i is the thread id. @@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth, // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - - pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; @@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth, // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! 
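// What the pack_rhs specializations above accomplish, written as a
// hypothetical scalar sketch: four rhs columns are interleaved so that each
// kernel step finds its nr=4 rhs coefficients contiguously in blockB. The
// packet path does the same thing a 4x4 tile at a time via ptranspose.
void sketch_pack_rhs4(float* blockB, const float* b0, const float* b1,
                      const float* b2, const float* b3, long depth) {
  long count = 0;
  for (long k = 0; k < depth; ++k) {
    blockB[count++] = b0[k];
    blockB[count++] = b1[k];
    blockB[count++] = b2[k];
    blockB[count++] = b3[k];
  }
}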
- if(shift>0) - while(info[i].sync!=k) {} - gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } // Then keep going as usual with the remaining B' @@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth, const Index actual_nc = (std::min)(j+nc,cols)-j; // pack B_k,j to B' - pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); // C_j += A' * B' - gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } // Release all the sub blocks A'_i of A' for the current thread, @@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... - for(Index k2=0; k2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) - // Note that this panel will be read as many times as the number of blocks in the rhs's - // horizontal panel which is, in practice, a very low number. - pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); + const Index actual_mc = (std::min)(i2+mc,rows)-i2; - // For each kc x nc block of the rhs's horizontal panel... - for(Index j2=0; j2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... 
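// The loops being rewritten above implement standard cache blocking. A
// compact, hypothetical reference sketch of the same structure, with packing
// elided and the inner triple loop standing in for the packed gebp kernel:
#include <algorithm>

void sketch_gemm(long rows, long cols, long depth, long mc, long nc, long kc,
                 const float* A, long lda, const float* B, long ldb,
                 float* C, long ldc, float alpha) {
  for (long k2 = 0; k2 < depth; k2 += kc) {        // K panel, sized for cache
    const long kend = std::min(k2 + kc, depth);
    for (long i2 = 0; i2 < rows; i2 += mc) {       // M block: pack_lhs here
      const long iend = std::min(i2 + mc, rows);
      for (long j2 = 0; j2 < cols; j2 += nc) {     // N block: pack_rhs + gebp
        const long jend = std::min(j2 + nc, cols);
        for (long j = j2; j < jend; ++j)
          for (long i = i2; i < iend; ++i) {
            float acc = 0;
            for (long k = k2; k < kend; ++k)
              acc += A[i + k * lda] * B[k + j * ldb];
            C[i + j * ldc] += alpha * acc;         // column-major result
          }
      }
    }
  }
}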
+ for(Index j2=0; j2m_nc; computeProductBlockingSizes(this->m_kc, this->m_mc, n); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 225b994d1..daa8a1d8a 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction @@ -75,10 +79,10 @@ struct general_matrix_matrix_triangular_product pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; tribb_kernel sybb; for(Index k2=0; k2 processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + ResMapper res(_res, resStride); + gebp_kernel gebp_kernel; + Matrix buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint 
micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper LhsTransposeMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; + gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_rhs pack_rhs; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2 transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix lhs(_lhs,lhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? 
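// The tribb_kernel above computes the diagonal block into a small temporary
// and then copies back only one triangular half, so the other half of the
// result is never written. A hypothetical sketch of that accumulation step:
void sketch_triangular_accumulate(float* C, long ldc, const float* buf,
                                  long n, bool upper) {
  for (long j = 0; j < n; ++j) {
    const long i0 = upper ? 0 : j;       // keep i <= j for Upper,
    const long i1 = upper ? j + 1 : n;   // i >= j for Lower
    for (long i = i0; i < i1; ++i)
      C[i + j * ldc] += buf[i + j * n];  // buf is the n x n temporary
  }
}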
k2>0 : k2 skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { Index cols = otherSize; - const_blas_data_mapper tri(_tri,triStride); - blas_data_mapper other(_other,otherStride); + + typedef const_blas_data_mapper TriMapper; + typedef blas_data_mapper OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache // coherence when accessing the rhs elements @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? 
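// For reference, an unblocked, hypothetical version of the forward
// substitution that triangular_solve_matrix performs panel-wise above
// (lower-triangular T, column-major storage, solving T * X = B in place).
// The blocked code replaces the inner subtraction with gebp calls carrying
// alpha = Scalar(-1).
void sketch_trsm_lower(const float* T, long ldt, float* B, long ldb,
                       long n, long nrhs) {
  for (long j = 0; j < nrhs; ++j)
    for (long i = 0; i < n; ++i) {
      float s = B[i + j * ldb];
      for (long k = 0; k < i; ++k)             // subtract known components
        s -= T[i + k * ldt] * B[k + j * ldb];
      B[i + j * ldb] = s / T[i + i * ldt];     // divide by the diagonal
    }
}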
k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix& blocking) { Index rows = otherSize; - const_blas_data_mapper rhs(_tri,triStride); - blas_data_mapper lhs(_other,otherStride); + + typedef blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 0d8e2705a..25a62d528 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template +template struct gebp_kernel; -template +template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -117,32 +117,96 @@ template struct get_factor::R static EIGEN_STRONG_INLINE typename NumTraits::Real run(const Scalar& x) { return numext::real(x); } }; + +template +class MatrixLinearMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. 
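// Hypothetical usage sketch for the mapper family introduced here, i.e. the
// MatrixLinearMapper above and the blas_data_mapper that follows: a mapper
// wraps a (pointer, stride) pair, getSubMapper() re-bases it on a block, and
// getLinearMapper() exposes 1-D coefficient and packet access from there on.
//   blas_data_mapper<float, long, ColMajor> res(C, ldc);
//   auto block = res.getSubMapper(i2, j2);     // origin moved to C(i2, j2)
//   auto col   = block.getLinearMapper(0, 3);  // walks column j2 + 3
//   float x    = col(5);                       // reads C(i2 + 5, j2 + 3)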
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template -class blas_data_mapper -{ +template +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef MatrixLinearMapper LinearMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + template + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper getSubMapper(Index i, Index j) const { + return const_blas_data_mapper(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5d8913dd8..75423f516 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -48,7 +48,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) +# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() -- cgit v1.2.3 From 12693928228922ecf8fa3fcf14341d195e376a11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 10:16:59 -0700 Subject: Created the IndexPair type to store pair of tensor indices. CUDA doesn't support std::pair so we can't use them when targeting GPUs. 
Improved the performance on tensor contractions --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 729 ++++++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 7 + 3 files changed, 656 insertions(+), 84 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 3812ecd1f..227522ecb 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -69,11 +69,13 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK template struct array_size; +template struct array_size > { + static const size_t value = N; +}; template struct array_size > { static const size_t value = N; }; - /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 46624724c..1e6f276e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,6 +20,319 @@ namespace Eigen { * */ namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? 
row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + 
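// The computeIndex logic above, specialized by hand to a tiny case for
// clarity: a rank-3 column-major lhs of dimensions (d0, d1, d2) contracted
// over axis 1, so matrix rows enumerate axes 0 and 2 while matrix columns
// enumerate the contracted axis. All names here are hypothetical.
long sketch_compute_index(long row, long col, long d0, long d1) {
  const long i0 = row % d0;           // coordinate on non-contracted axis 0
  const long i2 = row / d0;           // coordinate on non-contracted axis 2
  const long i1 = col;                // coordinate on the contracted axis 1
  return i0 * 1 + i1 * d0 + i2 * (d0 * d1);  // column-major strides
}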
+template::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + template struct traits > { @@ -53,6 +366,14 @@ struct nested, 1, typena typedef TensorContractionOp type; }; +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; +}; + } // end namespace internal @@ -102,143 +423,385 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator, Device> +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; static const int NumDims = max_n_1::Dimensions::count + 
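// The packet loads above reduce to this decision, sketched here with a
// hypothetical width-4 packet: when the source coefficients turn out to be
// consecutive in memory, one vector load suffices; otherwise the elements
// are gathered one coefficient at a time.
#include <array>

std::array<float, 4> sketch_load4(const float* data, long first, long last,
                                  const long idx[4]) {
  std::array<float, 4> p;
  if (last - first == 3) {                    // contiguous fast path
    for (int k = 0; k < 4; ++k) p[k] = data[first + k];
  } else {                                    // reordered/strided fallback
    for (int k = 0; k < 4; ++k) p[k] = data[idx[k]];
  }
  return p;
}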
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - typedef typename XprType::Index Index; + typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; - TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) { - Index index = 0; - Index stride = 1; - m_shiftright = 1; + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); + + array::Dimensions::count> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + } + + array::Dimensions::count> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + } - int skipped = 0; + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < internal::array_size::value; j++) { if (op.indices()[j].first == i) { - skip = true; - m_leftOffsets[2*skipped] = stride; - m_leftOffsets[2*skipped+1] = stride * left_dims[i]; - m_stitchsize[skipped] = left_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = left_dims[i]; - m_shiftright *= left_dims[i]; - } else { - ++skipped; + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + } + dim_idx++; + nocontract_idx++; } - stride *= left_dims[i]; } - stride = 1; - skipped = 0; + nocontract_idx = 0; const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < internal::array_size::value; 
j++) { if (op.indices()[j].second == i) { - skip = true; - m_rightOffsets[2*skipped] = stride; - m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = right_dims[i]; + if (!contracting) { + m_dimensions[dim_idx] = right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < internal::array_size::value; i++) { + Index left = op.indices()[i].first; + Index right = op.indices()[i].second; + + Index size = left_dims[left]; + eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + + if (i+1 < internal::array_size::value) { + m_k_strides[i+1] = m_k_strides[i] * size; } else { - ++skipped; + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < op.indices()[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; } - stride *= right_dims[i]; } - // Scalar case + // Scalar case. We represent the result as a 1d tensor of size 1. 
if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; - - const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - void evalTo(typename XprType::Scalar* buffer) const { - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - const Index startLeft = index % m_shiftright; - const Index startRight = index / m_shiftright; - CoeffReturnType result = CoeffReturnType(0); - partialStitch(startLeft, startRight, 0, result); - return result; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; } - /* TODO: vectorization template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - assert(false); - }*/ - - private: - EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { - Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); - Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); - - for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { - const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; - const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; - if (StitchIndex < internal::array_size::value-1) { - partialStitch(left, right, StitchIndex+1, accum); - } else { - accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); - } - } + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); } - 
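// evalTo above promotes three runtime booleans to template parameters, so
// each of the eight variants of the hot loop is compiled and optimized
// separately. The same pattern in miniature, with a hypothetical kernel and
// two flags instead of three:
#include <cstdio>

template <bool Contig, bool Reordered>
void sketch_kernel(float*) { std::printf("%d %d\n", Contig, Reordered); }

void sketch_dispatch(bool contig, bool reordered, float* buf) {
  if (contig)  reordered ? sketch_kernel<true, true>(buf)
                         : sketch_kernel<true, false>(buf);
  else         reordered ? sketch_kernel<false, true>(buf)
                         : sketch_kernel<false, false>(buf);
}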
Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - private: - array::value> m_leftOffsets; - array::value> m_rightOffsets; - array::value> m_stitchsize; - Index m_shiftright; Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + const Device& m_device; + Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; }; +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + 
this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + int sizeA = mc * kc; + int sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5a113dc19..11590b474 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,6 +29,13 @@ namespace Eigen { * \sa Tensor */ +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; // Boiler plate code -- cgit v1.2.3 From af2e5995e2ba48384024bbc8432bd6dbbebf71d2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:18:07 -0700 Subject: Improved support for CUDA devices. Improved contractions on GPU --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 1206 ++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 38 +- 3 files changed, 1237 insertions(+), 8 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 11161a547..b1bd2f676 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -44,6 +44,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 000000000..babe33fff --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1206 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
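// Aside: the evalTyped body in TensorContraction.h above is Eigen's blocked (Goto-style)
// GEMM applied to tensor mappers: slice the contraction into kc-deep panels, pack an
// mc x kc block of the left operand, and hand the packed blocks to the gebp micro-kernel.
// A minimal scalar sketch of that loop nest, with plain column-major arrays in place of
// the mappers and the packing elided (blocked_gemm_sketch is an illustrative name, not an
// Eigen function; C is assumed zero-initialized, as evalTyped guarantees via memset):
void blocked_gemm_sketch(const float* A, const float* B, float* C,
                         int m, int n, int k, int mc, int kc) {
  for (int i2 = 0; i2 < m; i2 += mc) {
    const int mc_eff = (mc < m - i2) ? mc : m - i2;   // edge blocks may be short
    for (int k2 = 0; k2 < k; k2 += kc) {
      const int kc_eff = (kc < k - k2) ? kc : k - k2;
      // The real code packs A(i2.., k2..) and B(k2.., ..) here for cache locality,
      // then runs the vectorized gebp kernel instead of these naive loops.
      for (int j = 0; j < n; ++j) {
        for (int i = 0; i < mc_eff; ++i) {
          float acc = 0.0f;
          for (int p = 0; p < kc_eff; ++p)
            acc += A[(i2 + i) + (k2 + p) * m] * B[(k2 + p) + j * k];
          C[(i2 + i) + j * m] += acc;  // accumulate across k2 panels
        }
      }
    }
  }
}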
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for the output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, it should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but that loop is now unrolled in the code below).
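// Aside: the pitch-9 padding described above is the classic bank-conflict fix. Shared
// memory on the GPUs this kernel targets is striped across 32 four-byte banks, so with
// an unpadded pitch of 8 the eight rows of a column cycle through only banks
// {0, 8, 16, 24}, while a pitch of 9 spreads them across eight distinct banks.
// A standalone host-side check of the claim (illustrative only, assumes 32 banks):
#include <cstdio>
int main() {
  for (int pitch : {8, 9}) {
    std::printf("pitch %d:", pitch);
    for (int row = 0; row < 8; ++row)
      std::printf(" bank %2d", (row * pitch) % 32);  // bank of each row's first element
    std::printf("\n");  // pitch 8 -> 0 8 16 24 0 8 16 24; pitch 9 -> 0 9 18 27 4 13 22 31
  }
  return 0;
}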
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if 
(rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here than at the bottom of the loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (i.e. width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation.
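// Aside: the reduction below is a butterfly. reduceMatrix is invoked with masks 1, 2
// and 4, and each __shfl_xor trades a register with the lane whose index differs in
// exactly that bit, so three rounds give every lane the sum of its 8-lane group
// without touching shared memory. The core pattern in isolation (same pre-CUDA-9
// __shfl_xor intrinsic as this file; butterfly_sum8 is an illustrative name):
__device__ float butterfly_sum8(float v) {
  for (int mask = 1; mask < 8; mask <<= 1)
    v += __shfl_xor(v, mask);  // add the value held by lane (laneid ^ mask)
  return v;  // every lane in the group of 8 now holds the full sum
}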
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + 
threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res + } + + + template +__global__ void +__launch_bounds__(512) + EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } + } + + + + template +__device__ EIGEN_STRONG_INLINE void + EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float4* lhs_shmem4, float2* rhs_shmem2, + const Index m_size, const Index n_size, const Index k_size) { + typedef float Scalar; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + const Index lane = threadIdx.x + 8 * (threadIdx.y % 4); + + // prefetch registers + float4 lhs_pf0; + float4 lhs_pf1; + + float4 rhs_pf0; + float4 rhs_pf1; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, it should be in threadIdx.x. + + // all of these indices assume float4 loading + // this thread loads the float4 starting at this index, and then also loads + // another float4 starting 32 columns to the right + const Index horiz_block_idx = threadIdx.z / 2; + const Index vert_block_idx = threadIdx.x / 2 + 4 * (threadIdx.y % 2); + const Index horiz_idx_in_block = threadIdx.y / 2 + 4 * (threadIdx.z % 2); + const Index vert_idx_in_block = threadIdx.x % 2; + + // there's padding in both the LHS and RHS shared memory layouts. This padding + // allows for 0 bank conflicts on all shmem stores and loads. + // LHS padding: 1 float4 on each 8x8 block of floats + // RHS padding: 1 float2 on each block, and 12 additional float2s between vertical blocks + // 3 and 4 + + // storage indices + // lhs index with respect to float4s + const Index lhs_store_idx_base = + 136 * horiz_block_idx + + 17 * vert_block_idx + + 8 * vert_idx_in_block + + horiz_idx_in_block; + + // rhs index with respect to floats + const Index rhs_store_idx_base = + 552 * horiz_block_idx + + 66 * vert_block_idx + + 32 * (horiz_idx_in_block / 4) + (horiz_idx_in_block % 4) + + 16 * vert_idx_in_block + + ((vert_block_idx < 4) ? 0 : 24); + + const Index lhs_store_idx_0 = lhs_store_idx_base + 544 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 544 * 1; + + const Index rhs_store_idx_0 = (rhs_store_idx_base / 2) + ((lane < 16) ?
0 : 4); + const Index rhs_store_idx_1 = rhs_store_idx_0 + 2; + const Index rhs_store_idx_2 = rhs_store_idx_0 + 1104; + const Index rhs_store_idx_3 = rhs_store_idx_1 + 1104; + + // The diagrams below show which shmem index (with respect to floats) each element + // in an 8x8 input block gets packed into: + // LHS: + // 0 4 8 12 16 20 24 28 + // 1 5 9 13 17 21 25 29 + // 2 6 10 14 18 22 26 30 + // 3 7 11 15 19 23 27 31 + // 32 36 40 44 48 52 56 60 + // ... (pack as 2 rows of float4 indexed row major, each float4 is vertical) + // + // RHS: + // 0 1 2 3 32 33 34 35 + // 4 5 6 7 36 37 38 39 + // ... (pack as 2 cols of float4 indexed col major, each float4 is horizontal) + + // Each thread in a warp loads 2 float4s. This happens in 2 instructions. On each of these + // instructions, the warp loads 2 columns (2 cols * 64 elements / col = 128 elements = 32 threads + // * 4 elements/thread). For the LHS, we're able to store the loaded float4 directly into + // shmem (using a 128 bit store instruction). For the RHS, we need to transpose the data. + // This is done with warp shuffles. Furthermore, we only use 64 bit stores for the RHS, because + // 64 bits is only 2 columns (which is all we load in a warp), and the padding for the RHS + // doesn't meet 64 bit alignment requirements (namely, the 4 consecutive floats that we want + // to load on the RHS are 8 byte aligned, not 16 byte aligned, which is required for float4). + + const Index load_idx_vert = 4 * (threadIdx.x + 8 * (threadIdx.y % 2)); + const Index load_idx_horiz = (threadIdx.y / 2) + 4 * threadIdx.z; + + const Index lhs_vert = base_m + load_idx_vert; + const Index rhs_horiz_0 = base_n + load_idx_horiz; + const Index rhs_horiz_1 = base_n + load_idx_horiz + 32; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = internal::pset1(0); \ + lhs_pf1 = internal::pset1(0); \ + \ + rhs_pf0 = internal::pset1(0); \ + rhs_pf1 = internal::pset1(0); \ + \ + const Index lhs_horiz_0 = base_k + load_idx_horiz; \ + const Index lhs_horiz_1 = base_k + load_idx_horiz + 32; \ + if (!needs_edge_check || lhs_vert + 3 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs.loadPacket(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ + } \ + } else if (lhs_vert + 2 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ + \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ + lhs_pf1.z = lhs(lhs_vert + 2, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ + } \ + } else if (lhs_vert + 1 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + } \ + } else if (lhs_vert < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + } 
\ +} \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (rhs_vert + 3 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ + } \ + } else if (rhs_vert + 2 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ + \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ + } \ + } else if (rhs_vert + 1 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + } \ + } else if (rhs_vert < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + } \ +} \ + \ + float swap_val0 = (lane < 16) ? rhs_pf0.z : rhs_pf0.x; \ + float swap_val1 = (lane < 16) ? rhs_pf0.w : rhs_pf0.y; \ + float swap_val2 = (lane < 16) ? rhs_pf1.z : rhs_pf1.x; \ + float swap_val3 = (lane < 16) ? rhs_pf1.w : rhs_pf1.y; \ + \ + swap_val0 = __shfl_xor(swap_val0, 16); \ + swap_val1 = __shfl_xor(swap_val1, 16); \ + swap_val2 = __shfl_xor(swap_val2, 16); \ + swap_val3 = __shfl_xor(swap_val3, 16); \ + \ + if (lane < 16) { \ + rhs_pf0.z = swap_val0; \ + rhs_pf0.w = swap_val1; \ + rhs_pf1.z = swap_val2; \ + rhs_pf1.w = swap_val3; \ + } else { \ + rhs_pf0.x = swap_val0; \ + rhs_pf0.y = swap_val1; \ + rhs_pf1.x = swap_val2; \ + rhs_pf1.y = swap_val3; \ + } \ +} \ + + +#define writeRegToShmem(_) \ + lhs_shmem4[lhs_store_idx_0] = lhs_pf0; \ + \ + rhs_shmem2[rhs_store_idx_0] = make_float2(rhs_pf0.x, rhs_pf0.z); \ + rhs_shmem2[rhs_store_idx_1] = make_float2(rhs_pf0.y, rhs_pf0.w); \ + \ + lhs_shmem4[lhs_store_idx_1] = lhs_pf1; \ + \ + rhs_shmem2[rhs_store_idx_2] = make_float2(rhs_pf1.x, rhs_pf1.z); \ + rhs_shmem2[rhs_store_idx_3] = make_float2(rhs_pf1.y, rhs_pf1.w); \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here than at the bottom of the loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + +#undef prefetchIntoRegisters +#undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + + float4 _lcol0; + float4 _lcol1; + float2 _rrow0; + float2 _rrow1; + float2 _rrow2; + float2 _rrow3; + +#define lcol0 _lcol0.x +#define lcol1 _lcol0.y +#define lcol2 _lcol0.z +#define lcol3 _lcol0.w +#define lcol4 _lcol1.x +#define lcol5 _lcol1.y +#define lcol6 _lcol1.z +#define lcol7 _lcol1.w +#define rrow0 _rrow0.x +#define rrow1 _rrow0.y +#define rrow2 _rrow1.x +#define rrow3 _rrow1.y +#define rrow4 _rrow2.x +#define rrow5 _rrow2.y +#define rrow6 _rrow3.x +#define rrow7 _rrow3.y + + // Now x corresponds to k, y to m, and z to n + const float4* lhs_block = &lhs_shmem4[threadIdx.x + 8 * (threadIdx.y % 2) + 17 * (threadIdx.y / 2)]; + const float2* rhs_block = &rhs_shmem2[2 * threadIdx.x + 16 * (threadIdx.z % 2) + 276 * (threadIdx.z / 2)]; + +#define lhs_element(i, k) lhs_block[68 * i + 136 * k] +#define rhs_element(k, j) rhs_block[33 * k + 1104 * j + ((k < 4) ? 0 : 12)] + +#define loadData(i) \ + _lcol0 = lhs_element(0, i); \ + _rrow0 = rhs_element(i, 0); \ + _rrow1 = *(&(rhs_element(i, 0)) + 1); \ + _lcol1 = lhs_element(1, i); \ + _rrow2 = rhs_element(i, 1); \ + _rrow3 = *(&(rhs_element(i, 1)) + 1); \ + +#define computeCol(j) \ + res(0, j) += lcol0 * rrow##j; \ + res(1, j) += lcol1 * rrow##j; \ + res(2, j) += lcol2 * rrow##j; \ + res(3, j) += lcol3 * rrow##j; \ + res(4, j) += lcol4 * rrow##j; \ + res(5, j) += lcol5 * rrow##j; \ + res(6, j) += lcol6 * rrow##j; \ + res(7, j) += lcol7 * rrow##j; \ + +#define computePass(i) \ + loadData(i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol0 +#undef lcol1 +#undef lcol2 +#undef lcol3 +#undef lcol4 +#undef lcol5 +#undef lcol6 +#undef lcol7 +#undef rrow0 +#undef rrow1 +#undef rrow2 +#undef rrow3 +#undef rrow4 +#undef rrow5 +#undef rrow6 +#undef rrow7 + +#undef computePass +#undef computeCol +#undef loadData +#undef lhs_element +#undef rhs_element + + } // end loop over k + + // we've now iterated over all of the large (i.e. width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation.
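// Aside: one subtle step in this kernel's prefetch macro (above) is the RHS transpose
// done with __shfl_xor(..., 16): each lane trades two of its four loaded components
// with the lane 16 away, so the two half-warps exchange halves and the data can then
// be written with aligned 64-bit float2 stores. The exchange reduces to this sketch
// (halfwarp_exchange is an illustrative name; lane is derived from threadIdx as in
// the kernel):
__device__ void halfwarp_exchange(float& lo, float& hi, int lane) {
  float v = (lane < 16) ? hi : lo;  // component this lane sends away
  v = __shfl_xor(v, 16);            // trade with the partner lane (lane ^ 16)
  if (lane < 16) hi = v; else lo = v;  // low half-warp's hi now holds the partner's lo
}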
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction: now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There are 3 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (3) have each thread copy its values into new registers using conditional + // logic and then do its own writes. This kernel uses (3). + +#define makeAssignments(i) \ + val0 = res(i, 0); \ + val1 = res(i, 1); \ + val2 = res(i, 2); \ + val3 = res(i, 3); \ + val4 = res(i, 4); \ + val5 = res(i, 5); \ + val6 = res(i, 6); \ + val7 = res(i, 7); \ + + Scalar val0; + Scalar val1; + Scalar val2; + Scalar val3; + Scalar val4; + Scalar val5; + Scalar val6; + Scalar val7; + + switch (threadIdx.x) { + case 0: + makeAssignments(0); + break; + case 1: + makeAssignments(1); + break; + case 2: + makeAssignments(2); + break; + case 3: + makeAssignments(3); + break; + case 4: + makeAssignments(4); + break; + case 5: + makeAssignments(5); + break; + case 6: + makeAssignments(6); + break; + case 7: + makeAssignments(7); + break; + } + +#undef res + + const Index vert_base = base_m + 4 * threadIdx.y + (threadIdx.x % 4) + 32 * (threadIdx.x / 4); + const Index horiz_base = base_n + 4 * threadIdx.z; + + if (!needs_edge_check || vert_base < m_size) { + if (!needs_edge_check || horiz_base + 35 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + output(vert_base, horiz_base + 34) = val6; + output(vert_base, horiz_base + 35) = val7; + } else if (horiz_base + 34 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + output(vert_base, horiz_base + 34) = val6; + } else if (horiz_base + 33 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + } else if (horiz_base + 32 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base,
horiz_base + 32) = val4; + } else if (horiz_base + 3 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + } else if (horiz_base + 2 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + } else if (horiz_base + 1 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + } else if (horiz_base < n_size) { + output(vert_base, horiz_base + 0) = val0; + } + } + } + + + template +__global__ void + __launch_bounds__(512) + EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float4 lhs_shmem[(68 * 64) / 4]; + __shared__ float2 rhs_shmem[((66 * 8 + 24) * 8) / 2]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } + } + + + template + struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + 
else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same::value && + internal::is_same::value) { + EigenFloatContractionKernel + <<m_device.stream()>>>(lhs, rhs, output, m, n, k); + } else { + EigenContractionKernel + <<m_device.stream()>>>(lhs, rhs, output, m, n, k); + } + + assert(cudaGetLastError() == cudaSuccess); + } + }; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index ef5e11537..fad342eab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -104,19 +104,41 @@ struct GpuDevice { EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } - /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ void* result; - cudaMalloc(&result, num_bytes); + assert(cudaMalloc(&result, num_bytes) == cudaSuccess); + assert(result != NULL); return result; +#else + assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif } - /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - cudaFree(buffer); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + assert(buffer != NULL); + assert(cudaFree(buffer) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, 
*stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - cudaMemsetAsync(buffer, c, n, *stream_); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } EIGEN_STRONG_INLINE size_t numThreads() const { -- cgit v1.2.3 From 152f3218ac9b6941cf6dbc960c2d4a6d1099eb06 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:33:44 -0700 Subject: Improved contraction test --- unsupported/test/cxx11_tensor_contraction.cpp | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index a37fcd967..2b599d30d 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -201,6 +201,37 @@ static void test_full_redux() } +static void test_contraction_of_contraction() +{ + Tensor t1(2, 2); + Tensor t2(2, 2); + Tensor t3(2, 2); + Tensor t4(2, 2); + t1.setRandom(); + t2.setRandom(); + t3.setRandom(); + t4.setRandom(); + + Eigen::array dims({{DimPair(1, 0)}}); + auto contract1 = t1.contract(t2, dims); + auto diff = t3 - contract1; + auto contract2 = t1.contract(t4, dims); + Tensor result = contract2.contract(diff, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 2); + + Eigen::Map m1(t1.data(), 2, 2); + Eigen::Map m2(t2.data(), 2, 2); + Eigen::Map m3(t3.data(), 2, 2); + Eigen::Map m4(t4.data(), 2, 2); + Eigen::MatrixXf expected = (m1 * m4) * (m3 - m1 * m2); + VERIFY_IS_APPROX(result(0, 0), expected(0, 0)); + VERIFY_IS_APPROX(result(0, 1), expected(0, 1)); + VERIFY_IS_APPROX(result(1, 0), expected(1, 0)); + VERIFY_IS_APPROX(result(1, 1), expected(1, 1)); +} + + static void test_expr() { Tensor mat1(2, 3); @@ -328,6 +359,7 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_multidims()); CALL_SUBTEST(test_holes()); CALL_SUBTEST(test_full_redux()); + CALL_SUBTEST(test_contraction_of_contraction()); CALL_SUBTEST(test_expr()); CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); -- cgit v1.2.3 From 95a430a2ca8489a85d0a12ffa66d260011c11745 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:45:19 -0700 Subject: Vector primitives for CUDA --- Eigen/Core | 5 + Eigen/src/Core/arch/CUDA/MathFunctions.h | 75 +++++++++ Eigen/src/Core/arch/CUDA/PacketMath.h | 260 +++++++++++++++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/MathFunctions.h create mode 100644 Eigen/src/Core/arch/CUDA/PacketMath.h diff --git a/Eigen/Core b/Eigen/Core index 776b7faf3..537ac16b2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -294,6 +294,11 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ 
+// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..5b0abe2e6 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
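// Aside: with the math-function specializations above, generic packet code can
// evaluate transcendentals four floats (or two doubles) at a time on the device.
// A minimal standalone use of the float4 pexp overload (exp_inplace is an ad-hoc
// kernel written for this sketch, not part of the patch; it assumes data is
// 16-byte aligned and n is a multiple of 4):
__global__ void exp_inplace(float* data, int n) {
  const int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 3 < n) {
    float4 v = *reinterpret_cast<float4*>(data + i);  // aligned packet load
    *reinterpret_cast<float4*>(data + i) = Eigen::internal::pexp(v);  // 4 exps at once
  }
}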
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + 
+ tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H -- cgit v1.2.3 From bbce6fa65d8a196f05e0428d014e0e3865e202f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:55:35 -0700 Subject: define EIGEN_VECTORIZE_CUDA when compiling with nvcc --- Eigen/Core | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index 537ac16b2..acdeca5f4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -178,6 +178,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif -- cgit v1.2.3 From 6c047d398daba5784da35d3b502360a5a7a83f33 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 13:29:36 -0700 Subject: Fixed a comment --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index d42167da9..4d7f9e1fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -18,7 +18,7 @@ namespace Eigen { * \brief The fixed sized version of the tensor class. * * The fixes sized equivalent of - * Eigen::Tensor t(3, 5, 7); + * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; */ -- cgit v1.2.3 From 0a07ac574ead83d314d518127d8d69595f6212b2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 13:32:41 -0700 Subject: Added support for the *= and /= operators to TensorBase --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 2f7c9ecda..90a9cc2c4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -307,11 +307,18 @@ class TensorBase : public TensorBase, const Derived, const OtherDerived>(derived(), other.derived()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const OtherDerived& other) { return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp -- cgit v1.2.3 From 44beee9d68e13dc299c6e2ea321aedc74c23d039 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 14:14:20 -0700 Subject: Removed dead code --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 417717b90..04849dd9f 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -12,9 +12,6 @@ namespace Eigen { -template class Stride; - - /** \class TensorMap * \ingroup CXX11_Tensor_Module * -- cgit v1.2.3 From 767424af18a55604496f38dd4593542db97240a1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Oct 2014 15:36:23 -0700 Subject: Improved the functors defined for standard reductions Added a functor to encapsulate the generation of random numbers on cpu and gpu. --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 72 ++++++++++++++++++++-- unsupported/test/cxx11_tensor_reduction.cpp | 33 ++++++++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 92984336c..e9aa22183 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -25,12 +25,12 @@ template struct SumReducer } private: - T m_sum; + typename internal::remove_all::type m_sum; }; template struct MaxReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max((std::numeric_limits::min)()) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max(-(std::numeric_limits::max)()) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { if (t > m_max) { m_max = t; } } @@ -39,7 +39,7 @@ template struct MaxReducer } private: - T m_max; + typename internal::remove_all::type m_max; }; template struct MinReducer @@ -53,9 +53,73 @@ template struct MinReducer } private: - T m_min; + typename internal::remove_all::type m_min; }; + +#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) +// We're not compiling a cuda kernel +template struct UniformRandomGenerator { + template + T operator()(Index, Index = 0) const { + return random(); + } + template + typename internal::packet_traits::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = random(); + } + return internal::pload::type>(values); + } +}; + +#else + +// We're compiling a cuda kernel +template struct UniformRandomGenerator; + +template <> struct UniformRandomGenerator { + UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template + float operator()(Index, Index = 0) const { + return curand_uniform(&m_state); + } + template + float4 packetOp(Index, Index = 0) const { + return curand_uniform4(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct UniformRandomGenerator { + UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template + double operator()(Index, Index = 0) const { + return curand_uniform_double(&m_state); + } + template + double2 packetOp(Index, Index = 0) const { + return curand_uniform2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 27135b982..da9885166 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -139,9 +139,42 @@ static void test_user_defined_reductions() } +static void test_tensor_maps() 
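The reducers above all share one protocol: default-construct the accumulator, fold one coefficient at a time with reduce(), and read the result back through the finalize()-style accessor the built-in reducers expose. A user-defined reducer only has to mirror that shape; the ProdReducer below is a hypothetical sketch (assuming Eigen's function-attribute macros and placement alongside the built-ins in namespace Eigen::internal), not code from the patch:

// Hypothetical user-defined reducer mirroring Sum/Max/Min above.
template <typename T> struct ProdReducer
{
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ProdReducer() : m_prod(1) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { m_prod *= t; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { return m_prod; }
 private:
  typename Eigen::internal::remove_all<T>::type m_prod;
};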
+{ + int inputs[2*3*5*7]; + TensorMap > tensor_map(inputs, 2,3,5,7); + TensorMap > tensor_map_const(inputs, 2,3,5,7); + const TensorMap > tensor_map_const_const(inputs, 2,3,5,7); + + tensor_map.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor result = tensor_map.sum(reduction_axis); + Tensor result2 = tensor_map_const.sum(reduction_axis); + Tensor result3 = tensor_map_const_const.sum(reduction_axis); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + int sum = 0; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor_map(i, k, j, l); + } + } + VERIFY_IS_EQUAL(result(i, j), sum); + VERIFY_IS_EQUAL(result2(i, j), sum); + VERIFY_IS_EQUAL(result3(i, j), sum); + } + } +} + + void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_simple_reductions()); CALL_SUBTEST(test_full_reductions()); CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_tensor_maps()); } -- cgit v1.2.3 From 498b7eed25bdb3b90f2fc45dd822c96aa08db2f8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Oct 2014 15:39:13 -0700 Subject: Rewrote the TensorBase::random method to support the generation of random number on gpu. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 90a9cc2c4..d4b7846a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -39,9 +39,14 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(derived()); + return TensorCwiseNullaryOp, const Derived>(derived()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random() const { + return TensorCwiseNullaryOp(derived()); } // Coefficient-wise unary operators -- cgit v1.2.3 From a991f94c0e5c51555875564ce58681a82d07cd69 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 15:20:37 -0700 Subject: Fixed the thread pool test --- test/main.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/main.h b/test/main.h index b504970f3..9cb41c828 100644 --- a/test/main.h +++ b/test/main.h @@ -47,8 +47,8 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
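The rewritten random() above comes in two flavors: the parameterless overload keeps the old behavior through internal::UniformRandomGenerator, and the templated one accepts any generator functor exposing operator()(Index) and packetOp(Index). A minimal usage sketch (the include path and demo function are illustrative only):

#include <unsupported/Eigen/CXX11/Tensor>  // module include, assumed

void random_demo() {
  Eigen::Tensor<float, 2> t(3, 4);
  // Default: internal::UniformRandomGenerator<float> on the CPU, the
  // curand-based specialization inside CUDA kernels.
  Eigen::Tensor<float, 2> r1 = t.random();
  // Explicit generator type; any functor with operator()(Index) and
  // packetOp(Index) should work here.
  Eigen::Tensor<float, 2> r2 =
      t.random<Eigen::internal::UniformRandomGenerator<float> >();
}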
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses +//#define min(A,B) please_protect_your_min_with_parentheses +//#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index faf965df8..84768ca09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -131,7 +131,7 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - vector > results; + std::vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 75423f516..1c4d0838a 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,5 +122,5 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") + ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2e67b2064..e02d8e4be 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,9 +17,9 @@ using Eigen::Tensor; void test_cxx11_tensor_thread_pool() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); + Eigen::Tensor in1(2,3,7); + Eigen::Tensor in2(2,3,7); + Eigen::Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); @@ -30,7 +30,7 @@ void test_cxx11_tensor_thread_pool() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); } } } -- cgit v1.2.3 From 4b36c3591f247d4be38e5a12dbed7ac0d1ad2bff Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 15:43:21 -0700 Subject: Fixed the tensor shuffling test --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 133 ++++++++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 +- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 2 +- unsupported/test/cxx11_tensor_shuffling.cpp | 9 +- 5 files changed, 141 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 11590b474..732c6b344 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -37,8 +37,7 @@ template struct IndexPair { Index second; }; - -// Boiler plate code +// Boilerplate code namespace internal { template struct dget { @@ -110,6 +109,11 @@ struct Sizes : internal::numeric_list { } }; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +} + #else template @@ -136,9 +140,21 @@ template 
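For context, the std::vector fix above sits in the block-splitting loop of the multithreaded executor. Stripped of Eigen specifics, the pattern is the one sketched below: carve the flat index range into blocks, evaluate each block on its own std::async task, then join. The Evaluator type bound and the inline tail handling are assumptions for illustration, not the executor's exact code:

#include <cstddef>
#include <future>
#include <vector>

// Minimal sketch of the blocked std::async pattern used by the multithreaded
// TensorExecutor. Evaluator is any type exposing evalScalar(Index).
template <typename Evaluator, typename Index>
void parallel_eval(Evaluator& evaluator, Index size, Index blocksize) {
  const Index numblocks = size / blocksize;
  std::vector<std::future<void> > results;
  results.reserve(numblocks);
  for (Index b = 0; b < numblocks; ++b) {
    results.push_back(std::async(std::launch::async, [&evaluator, b, blocksize] {
      for (Index i = b * blocksize; i < (b + 1) * blocksize; ++i) {
        evaluator.evalScalar(i);
      }
    }));
  }
  for (std::size_t i = 0; i < results.size(); ++i) results[i].get();
  // Whatever did not fit in a full block is handled inline.
  for (Index i = numblocks * blocksize; i < size; ++i) evaluator.evalScalar(i);
}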
Sizes(DenseIndex... indices) { } explicit Sizes(std::initializer_list l) { // todo: add assertion } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + } #endif template Sizes& operator = (const T& other) { @@ -156,9 +172,14 @@ template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +}; + #endif -// Boiler plate +// Boilerplate namespace internal { template struct tensor_index_linearization_helper @@ -243,6 +264,112 @@ struct DSizes : array { }; + + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_vsize_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const&) + { + return array_get(indices); + } +}; +} // end namespace internal + +template +struct VSizes : std::vector { + typedef std::vector Base; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC VSizes() { } + EIGEN_DEVICE_FUNC explicit VSizes(const std::vector& a) : Base(a) { } + + template + EIGEN_DEVICE_FUNC explicit VSizes(const array& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { + this->resize(1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { + this->resize(2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + this->resize(3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + this->resize(4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + this->resize(5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + + VSizes& operator = (const std::vector& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { + return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { + return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + + +// Boilerplate +namespace internal { +template 
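Both IndexOfColMajor and the linearization helper above compute the standard Horner-style formula: for dimensions d0..d(n-1), the flat col-major index of (i0,...,i(n-1)) is i0 + d0*(i1 + d1*(i2 + ...)). A loop version, for reference only (using the module's array alias):

#include <cstddef>

// Reference implementation of the col-major linearization performed by
// tensor_index_linearization_helper (recursion unrolled into a loop).
template <typename Index, std::size_t NumDims>
Index col_major_index(const Eigen::array<Index, NumDims>& indices,
                      const Eigen::array<Index, NumDims>& dims) {
  Index flat = 0;
  for (int i = static_cast<int>(NumDims) - 1; i >= 0; --i) {
    flat = flat * dims[i] + indices[i];
  }
  return flat;  // == i0 + d0*(i1 + d1*(i2 + ...))
}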
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes& sizes) { + DenseIndex total_size = 1; + for (int i = 0; i < sizes.size(); ++i) { + total_size *= sizes[i]; + } + return total_size; +} +} + namespace internal { template struct array_size > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 84768ca09..10f5a5ee7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -39,7 +39,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); } @@ -60,7 +60,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; @@ -122,7 +122,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; @@ -176,7 +176,7 @@ class TensorExecutor const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); const int block_size = maxCudaThreadsPerBlock(); - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); EigenMetaKernel > <<>>(evaluator, size); assert(cudaGetLastError() == cudaSuccess); } diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 1c4d0838a..ac2ccaf27 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -119,7 +119,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") -# ei_add_test(cxx11_tensor_shuffling "-std=c++0x") + ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index d270486f2..b0501aaa3 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -179,7 +179,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(mat3(array(i,j,k)), powf(val, 3.5f)); + VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); val += 1.0; } } diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 5ab8b6821..39c623499 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::array; static void test_simple_shuffling() { @@ -80,10 +81,10 @@ static void test_expr_shuffling() Tensor result(5,7,3,2); - array src_slice_dim(Eigen::array(2,3,1,7)); - array 
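The VSizes type introduced above trades the compile-time rank of Sizes/DSizes for a rank chosen at run time, which is why the executors now compute the evaluation range with array_prod() instead of a TotalSize() member. A small usage sketch:

#include <cassert>

void vsizes_demo() {
  // Rank picked at run time; the extents live in the underlying std::vector.
  Eigen::VSizes<Eigen::DenseIndex> sizes(2, 3, 5);
  assert(sizes.TotalSize() == 2 * 3 * 5);
  // array_prod(sizes) computes the same product, and also accepts Sizes and
  // DSizes, which is what lets TensorExecutor treat all three uniformly.
}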
src_slice_start(Eigen::array(0,0,0,0)); - array dst_slice_dim(Eigen::array(1,7,3,2)); - array dst_slice_start(Eigen::array(0,0,0,0)); + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{1,7,3,2}}; + array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = -- cgit v1.2.3 From 2ed1838aeb6d3c70c35dbd8d545fba1e7e1c68dc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 16:11:27 -0700 Subject: Added support for tensor chips --- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 232 ++++++++++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 3 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_chipping.cpp | 244 +++++++++++++++++++++ 6 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h create mode 100644 unsupported/test/cxx11_tensor_chipping.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index b1bd2f676..5a6246a03 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -47,6 +47,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index d4b7846a0..cadeb3b19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -254,6 +254,11 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorPaddingOp pad(const PaddingDimensions& padding) const { @@ -327,7 +332,7 @@ class TensorBase : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp - reshape(const NewDimensions& newDimensions) { + reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -335,6 +340,11 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 000000000..9ecea9108 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 
2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorChippingOp + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorChippingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorChippingOp type; +}; + +} // end namespace internal + + + +template +class TensorChippingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset) + : m_xpr(expr), m_offset(offset) {} + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = false, // not yet implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device) + { + // We could also support the case where NumInputDims==1 if needed. 
+ EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumInputDims > DimId, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != DimId) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + for (int i = 0; i < DimId; ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + m_inputStride *= input_dims[DimId]; + m_inputOffset = m_stride * op.offset(); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + /* to be done + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + + }*/ + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if (DimId == 0) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if (DimId == NumInputDims-1) { + // m_stride is always greater than index, so let's avoid the integer division. 
+ eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const Device& m_device; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + /* to be done + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + } */ +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index bc67586a4..86ddd1ae8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,11 +21,12 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorBroadcastingOp; template class TensorReductionOp; template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; +template class TensorBroadcastingOp; +template class TensorChippingOp; template class TensorReshapingOp; template class TensorSlicingOp; template class TensorPaddingOp; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index ac2ccaf27..48435eb9c 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -115,6 +115,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") + ei_add_test(cxx11_tensor_chipping "-std=c++0x") ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp new file mode 100644 index 000000000..8c8a0cec2 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -0,0 +1,244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
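In plain terms, srcCoeff() above splits an output index at the stride of the removed dimension and re-inserts the fixed offset. A standalone reference version of the general branch, using the same precomputed quantities as the evaluator:

// chip<DimId>(offset) precomputes
//   m_stride      = prod(input_dims[0 .. DimId-1])
//   m_inputStride = m_stride * input_dims[DimId]
//   m_inputOffset = m_stride * offset
// and the general branch of srcCoeff() then maps indices like this:
template <typename Index>
Index chip_src_coeff(Index index, Index stride, Index inputStride,
                     Index inputOffset) {
  const Index idx = index / stride;        // coordinates above the chipped dim
  const Index rem = index - idx * stride;  // coordinates below it
  return idx * inputStride + rem + inputOffset;
}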
+ +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_simple_chip() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor chip1; + chip1 = tensor.chip<0>(1); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor chip2 = tensor.chip<1>(1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor chip3 = tensor.chip<2>(2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor chip4(tensor.chip<3>(5)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor chip5(tensor.chip<4>(7)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + + +static void test_chip_in_expr() { + Tensor input1(2,3,5,7,11); + input1.setRandom(); + Tensor input2(3,5,7,11); + input2.setRandom(); + + Tensor result = input1.chip<0>(0) + input2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + float expected = input1(0,i,j,k,l) + input2(i,j,k,l); + VERIFY_IS_EQUAL(result(i,j,k,l), expected); + } + } + } + } + + Tensor input3(3,7,11); + input3.setRandom(); + Tensor result2 = input1.chip<0>(0).chip<1>(2) + input3; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 11; ++k) { + float expected = input1(0,i,2,j,k) + input3(i,j,k); + VERIFY_IS_EQUAL(result2(i,j,k), expected); + } + } + } +} + + +static void test_chip_as_lvalue() +{ + Tensor input1(2,3,5,7,11); + input1.setRandom(); + + Tensor input2(3,5,7,11); + input2.setRandom(); + Tensor tensor = input1; + tensor.chip<0>(1) = input2; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + Tensor input3(2,5,7,11); + input3.setRandom(); + tensor = input1; + tensor.chip<1>(1) = 
input3; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + Tensor input4(2,3,7,11); + input4.setRandom(); + tensor = input1; + tensor.chip<2>(3) = input4; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); + } + } + } + } + } + } + + Tensor input5(2,3,5,11); + input5.setRandom(); + tensor = input1; + tensor.chip<3>(4) = input5; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + + Tensor input6(2,3,5,7); + input6.setRandom(); + tensor = input1; + tensor.chip<4>(5) = input6; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } +} + + +void test_cxx11_tensor_chipping() +{ + CALL_SUBTEST(test_simple_chip()); + CALL_SUBTEST(test_chip_in_expr()); + CALL_SUBTEST(test_chip_as_lvalue()); +} -- cgit v1.2.3 From 0219f8aed44279858330b1c07402c066f5b75459 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 16:17:26 -0700 Subject: Added ability to print a tensor using an iostream. --- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 44 +++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_io.cpp | 70 +++++++++++++++++++++++++++ 4 files changed, 117 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIO.h create mode 100644 unsupported/test/cxx11_tensor_io.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 5a6246a03..79510fd96 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -64,6 +64,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + #include "Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 000000000..959b5db73 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
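Summarizing the tests above: chip<N>(i) fixes dimension N at index i and drops it from the result, and thanks to the lvalue evaluator a chip can also be assigned to. Typical usage, a sketch mirroring those tests:

void chip_demo() {
  Eigen::Tensor<float, 3> t(2, 3, 5);
  t.setRandom();

  // Read: fix dimension 1 at index 2, yielding a 2x5 tensor.
  Eigen::Tensor<float, 2> slice = t.chip<1>(2);

  // Write: the lvalue specialization lets a chip appear on the left-hand side.
  Eigen::Tensor<float, 2> repl(2, 5);
  repl.setRandom();
  t.chip<1>(2) = repl;
}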
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +template +std::ostream& operator << (std::ostream& os, const TensorBase& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp eval = expr.eval(); + TensorEvaluator, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename T::Scalar Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + if (internal::array_size::value == 1) { + Map > array(tensor.data(), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + Map > matrix(tensor.data(), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 48435eb9c..99593b562 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -124,4 +124,5 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") + ei_add_test(cxx11_tensor_io "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp new file mode 100644 index 000000000..b73c024f5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
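The operator<< just added evaluates its argument first (so arbitrary expressions print, not just materialized tensors), writes rank-1 tensors as a column, and flattens anything of higher rank to a first_dim x (total_size / first_dim) matrix. A short sketch of what that enables:

#include <iostream>

void io_demo() {
  Eigen::Tensor<int, 2> t(2, 3);
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) t(i, j) = i + j;

  std::cout << t << "\n";      // printed as a 2x3 matrix
  std::cout << t + t << "\n";  // expressions are forced-evaluated first
}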
+ +#include "main.h" +#include +#include +#include + + +static void test_output_1d() +{ + Tensor tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + std::stringstream os; + os << tensor; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +static void test_output_2d() +{ + Tensor tensor(5, 3); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 3; ++j) { + tensor(i, j) = i*j; + } + } + + std::stringstream os; + os << tensor; + + std::string expected("0 0 0\n0 1 2\n0 2 4\n0 3 6\n0 4 8"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +static void test_output_expr() +{ + Tensor tensor1(5); + Tensor tensor2(5); + for (int i = 0; i < 5; ++i) { + tensor1(i) = i; + tensor2(i) = 7; + } + + std::stringstream os; + os << tensor1 + tensor2; + + std::string expected(" 7\n 8\n 9\n10\n11"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +void test_cxx11_tensor_io() +{ + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_expr()); +} -- cgit v1.2.3 From 4c70b0a7627d45286ecbb3c73d2d774412168205 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 10:04:04 -0700 Subject: Added support for patch extraction --- unsupported/Eigen/CXX11/Tensor | 7 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 212 +++++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_patch.cpp | 103 ++++++++++ 6 files changed, 330 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h create mode 100644 unsupported/test/cxx11_tensor_patch.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 79510fd96..0dac95e45 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -27,6 +28,11 @@ #include #include +#include + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#include +#endif #include "Eigen/Core" @@ -46,6 +52,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index cadeb3b19..27c10f64f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -243,6 +243,12 @@ class TensorBase return TensorConcatenationOp(derived(), other.derived(), axis); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + // Morphing operators. 
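// A usage sketch for the extract_patches() method declared above (sizes are
// arbitrary): the result has rank N+1, where the first N dimensions are the
// patch extents and the last one enumerates the patches, with
// input_dims[i] - patch_dims[i] + 1 valid offsets per dimension.
void patch_demo() {
  Eigen::Tensor<float, 3> input(10, 10, 3);
  input.setRandom();

  Eigen::array<Eigen::DenseIndex, 3> patch_dims;
  patch_dims[0] = 4; patch_dims[1] = 4; patch_dims[2] = 3;

  // Dimensions: 4 x 4 x 3 x 49, since (10-4+1)*(10-4+1)*(3-3+1) == 49.
  Eigen::Tensor<float, 4> patches = input.extract_patches(patch_dims);
}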
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 86ddd1ae8..67f478822 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,6 +25,7 @@ template class TensorReductionOp; template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; +template class TensorPatchOp; template class TensorBroadcastingOp; template class TensorChippingOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 000000000..01f2daf52 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPatchOp type; +}; + +} // end namespace internal + + + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + 
{ + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[NumDims - 1]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1]; + + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1], + indices[1] / m_outputStrides[NumDims - 1]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1], + indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]}; + + Index inputIndices[2] = {0, 0}; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + 
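// For reference, the index math used by coeff() above in scalar form: split
// the flat output index into a patch id and an intra-patch offset, then walk
// the dimensions top-down, converting both back into one input coordinate.
// Same stride arrays as the evaluator precomputes in its constructor.
template <typename Index, int NumDims>
Index patch_src_coeff(Index index,
                      const Eigen::array<Index, NumDims>& outputStrides,
                      const Eigen::array<Index, NumDims - 1>& inputStrides,
                      const Eigen::array<Index, NumDims - 1>& patchStrides) {
  Index patchIndex  = index / outputStrides[NumDims - 1];
  Index patchOffset = index - patchIndex * outputStrides[NumDims - 1];
  Index inputIndex  = 0;
  for (int i = NumDims - 2; i > 0; --i) {
    const Index patchIdx = patchIndex / patchStrides[i];
    patchIndex -= patchIdx * patchStrides[i];
    const Index offsetIdx = patchOffset / outputStrides[i];
    patchOffset -= offsetIdx * outputStrides[i];
    inputIndex += (patchIdx + offsetIdx) * inputStrides[i];
  }
  return inputIndex + patchIndex + patchOffset;
}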
values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 99593b562..d6c435947 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -119,6 +119,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_patch "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp new file mode 100644 index 000000000..e2ba5bfd8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + Tensor no_patch; + no_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + Tensor twod_patch; + twod_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + Tensor threed_patch; + threed_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 
3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } + } + } + } + } + } + } +} + + +void test_cxx11_tensor_patch() +{ + CALL_SUBTEST(test_simple_patch()); + // CALL_SUBTEST(test_expr_shuffling()); +} -- cgit v1.2.3 From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 17:02:09 -0700 Subject: Misc improvements and cleanups --- Eigen/src/Core/GenericPacketMath.h | 15 +- unsupported/Eigen/CXX11/Tensor | 4 + .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 5 + .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 101 ++++++++- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 35 ++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 73 ++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 20 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 +++- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 26 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 22 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 61 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 32 +-- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_assign.cpp | 35 +++- unsupported/test/cxx11_tensor_convolution.cpp | 70 +++++++ unsupported/test/cxx11_tensor_device.cpp | 27 +++ unsupported/test/cxx11_tensor_morphing.cpp | 5 +- unsupported/test/cxx11_tensor_of_complex.cpp | 64 ++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 232 ++++++++++++++++++++- 29 files changed, 780 insertions(+), 141 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e6fea5bba..3ef3475c7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -359,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template -inline Packet ploadt(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits::type* from) { if(LoadMode == Aligned) return pload(from); @@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. 
+ * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+ * hardware if available to speedup the loading of data that won't be modified
+ * by the current computation.
+ */
+template<typename Packet, int LoadMode>
+inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0dac95e45..2137f4276 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -30,6 +30,10 @@
 #include
 #include
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 #include
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
index 227522ecb..e30eb6ad8 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
@@ -66,6 +66,11 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
 template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
 template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
+template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+
+
 #undef STD_GET_ARR_HACK
 
 template<class T> struct array_size;
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
index 4c6b95773..e45d0a3b1 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -48,7 +48,8 @@ template <typename T, size_t n> class array {
     values[2] = v3;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
     EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -56,7 +57,8 @@ template <typename T, size_t n> class array {
     values[3] = v4;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
     EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -64,6 +66,43 @@ template <typename T, size_t n> class array {
     values[3] = v4;
     values[4] = v5;
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { @@ -93,9 +132,11 @@ template struct type_list { struct null_type { }; -template +template struct make_type_list { - typedef typename make_type_list::type tailresult; + typedef typename make_type_list::type tailresult; typedef type_list type; }; @@ -150,6 +191,23 @@ template struct gen_numeric_list_repeated { typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; }; +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val, type2val >::type type; +}; + template struct get; @@ -174,6 +232,7 @@ template <> struct arg_prod { static const int value = 1; }; + template array repeat(t v) { array array; @@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l return get >::value; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod::value; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; @@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { + return a[I]; +} +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; template struct array_size; template struct array_size > { static const size_t value = N; }; - +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; struct sum_op { template static inline bool run(A a, B b) { return a + b; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 3bfe80c9e..e973c00d3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -131,8 +131,8 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 27c10f64f..6018ecc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,6 +30,12 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + // Dimensions + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -187,7 +193,7 @@ class TensorBase } // Contractions. - typedef std::pair DimensionPair; + typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorContractionOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 3b2a9c8b9..0e55d4de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorBroadcastingOp : public TensorBase, WriteAccessors> +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -91,7 +91,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { values[i] = coeff(originalIndex+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4a5fd9c79..34bdd5309 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -872,11 +872,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + private: // No assignment (copies are needed by the kernels) 
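// Illustrative note: declaring the copy-assignment operator private and never
// defining it, as on the next line, is the pre-C++11 idiom for making a class
// non-assignable; with C++11 support the equivalent would be
//   TensorEvaluator& operator=(const TensorEvaluator&) = delete;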
TensorEvaluator& operator = (const TensorEvaluator&);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 75519c9f5..649bdb308 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -38,6 +38,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const DeviceType& m_device;
     ExpressionType& m_expression;
@@ -58,6 +70,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const ThreadPoolDevice& m_device;
     ExpressionType& m_expression;
@@ -79,6 +103,17 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const GpuDevice& m_device;
     ExpressionType m_expression;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index fad342eab..5a6ff70e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -37,23 +37,41 @@ struct DefaultDevice {
 // Multiple cpu cores
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+
 struct ThreadPoolDevice {
-  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
-  size_t numThreads() const { return num_threads_; }
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
   }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    internal::aligned_free(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    return num_threads_;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
+    return std::async(std::launch::async, f, args...);
+  }
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
+    std::async(std::launch::async, f, args...);
+  }
+
+ private:
+  // todo: NUMA, ...
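  // Usage sketch for the enqueue API above (illustrative; my_task and arg are
  // hypothetical names, and EIGEN_USE_THREADS must be defined):
  //   Eigen::ThreadPoolDevice device(4);
  //   Future f = device.enqueue(&my_task, arg);
  //   f.wait();  // block until my_task(arg) has run
  // Caveat: a std::future obtained from std::async(std::launch::async, ...)
  // blocks in its destructor, so enqueueNoFuture as written still waits for
  // the task to finish before returning; a real thread pool would not.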
size_t num_threads_; @@ -63,41 +81,34 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU -static int m_numMultiProcessors = 0; -static int m_maxThreadsPerBlock = 0; -static int m_maxThreadsPerMultiProcessor = 0; +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} static inline int getNumCudaMultiProcessors() { - if (m_numMultiProcessors == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_numMultiProcessors = deviceProp.multiProcessorCount; - } - return m_numMultiProcessors; + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; } static inline int maxCudaThreadsPerBlock() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - } - return m_maxThreadsPerBlock; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; } static inline int maxCudaThreadsPerMultiProcessor() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - } - return m_maxThreadsPerMultiProcessor; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; } + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
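// Construction sketch (illustrative; the stream must outlive the device):
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::GpuDevice device(&stream);
//   ... evaluate tensor expressions on the device ...
//   cudaStreamDestroy(stream);
// Note also that the lazy initializeDeviceProp() above is guarded by a plain
// static bool, so concurrent first calls from several host threads could race.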
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } @@ -141,8 +152,8 @@ struct GpuDevice { #endif } - EIGEN_STRONG_INLINE size_t numThreads() const { - // Fixme: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME return 32; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 732c6b344..2dd8e274b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,7 +29,7 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pairs on cuda devices +// Can't use std::pair on cuda devices template struct IndexPair { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 587cbd5ca..ce9d73578 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } - EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 0f969036c..e324ba8d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -65,13 +65,13 @@ struct TensorEvaluator return m_data[index]; } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -113,13 +113,17 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else return m_data[index]; +#endif } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_data + index); + return internal::ploadt_ro(m_data + index); } const Scalar* data() const { return m_data; } @@ -166,7 +170,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -219,7 +223,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -278,7 +282,7 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -340,7 +344,7 @@ struct 
TensorEvaluator return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 10f5a5ee7..01fa04c64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorExecutor @@ -62,7 +58,7 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; + const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); @@ -131,10 +127,10 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - std::vector > results; + std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -154,11 +150,31 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; const int step_size = blockDim.x * gridDim.x; - for (int i = first_index; i < size; i += step_size) { - eval.evalScalar(i); + + if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { + // Use the scalar path + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } + } + else { + // Use the vector path + const int PacketSize = unpacket_traits::size; + const int vectorized_step_size = step_size * PacketSize; + const int vectorized_size = (size / PacketSize) * PacketSize; + int i = first_index * PacketSize; + for ( ; i < vectorized_size; i += vectorized_step_size) { + eval.evalPacket(i); + } + for ( ; i < size; i += step_size) { + eval.evalScalar(i); + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 4d7f9e1fd..a753c5a48 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief The fixed sized version of the tensor class. 
* - * The fixes sized equivalent of + * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase::size > 1), }; typedef Dimensions_ Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index cf97031be..2714117ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -31,30 +31,34 @@ namespace internal { template struct TensorIntDivisor { public: - TensorIntDivisor() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { multiplier = 0; shift1 = 0; shift2 = 0; } // Must have 1 <= divider <= 2^31-1 - TensorIntDivisor(const T divider) { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; eigen_assert(divider > 0); eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 +#ifndef __CUDA_ARCH__ const int leading_zeros = __builtin_clz(divider); - const int l = N - (leading_zeros+1); - - multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; - shift1 = (std::min)(1, l); - shift2 = (std::max)(0, l-1); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); + + multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; } // Must have 0 <= numerator <= 2^32-1 - T divide(const T numerator) const { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; eigen_assert(numerator >= 0); eigen_assert(numerator <= (1ull< -static T operator / (const T& numerator, const TensorIntDivisor& divisor) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 04849dd9f..2c0d2cd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,26 +42,25 @@ template class TensorMap : public Tensor static const int Options = Options_; - static const std::size_t NumIndices = PlainObjectType::NumIndices; + static const Index NumIndices = PlainObjectType::NumIndices; typedef typename PlainObjectType::Dimensions Dimensions; - enum { - IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), - PacketAccess = true, + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits::size > 1), }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
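// Illustrative note on the relaxed assertion below: a TensorMap whose rank is
// the Eigen::Dynamic sentinel cannot know its rank at compile time, so the
// static assertion now also accepts NumIndices == Dynamic instead of rejecting
// every argument count outright.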
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif @@ -176,12 +175,13 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7da89458f..8da6e0f26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -144,7 +144,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -206,7 +206,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f7e7fc107..7e0063626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), }; @@ -194,7 +194,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = 
(internal::packet_traits::size > 1), }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0c4f8a3d6..aaec39756 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,11 +30,11 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template +template class TensorStorage { private: @@ -62,7 +62,7 @@ class TensorStorage // pure-dynamic, but without specification of all dimensions explicitly -template +template class TensorStorage : public TensorStorage::type> { @@ -79,7 +79,7 @@ class TensorStorage }; // pure dynamic -template +template class TensorStorage::type> { T *m_data; @@ -140,6 +140,7 @@ class TensorStorage, 1, typename eval -class TensorStridingOp : public TensorBase, WriteAccessors> +class TensorStridingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,28 +109,23 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } - } - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] *= op.strides()[i]; + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } - // typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -150,16 +145,44 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + 
inputIndices[1] += indices[1] * m_inputStrides[0]; + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } Scalar* data() const { return NULL; } protected: - // Strides m_strides; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 40f805741..5940a8cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -70,14 +70,18 @@ struct traits > }; -template -struct traits > +template +struct traits > : public traits { typedef traits BaseTraits; typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; }; @@ -105,16 +109,16 @@ struct eval, Eigen::Dense> typedef const TensorFixedSize& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; template @@ -141,16 +145,16 @@ struct nested, 1, typename e typedef const TensorFixedSize& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6c435947..a7ef2b402 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index f2b126413..0ac3f9bf9 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -253,6 +253,39 @@ static void test_auto_resize() } +static void test_compound_assign() +{ + Tensor start_tensor(10); + Tensor offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor 
= start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+
 void test_cxx11_tensor_assign()
 {
   CALL_SUBTEST(test_1d());
@@ -260,5 +293,5 @@ void test_cxx11_tensor_assign()
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_same_type());
   CALL_SUBTEST(test_auto_resize());
-
+  CALL_SUBTEST(test_compound_assign());
 }
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index bafe73edd..4672db463 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -64,8 +64,78 @@ static void test_expr()
 }
 
 
+static void test_modes() {
+  Tensor<float, 1> input(3);
+  Tensor<float, 1> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  const Eigen::array<ptrdiff_t, 1> dims{{0}};
+  Eigen::array<std::pair<int, int>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
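  // Illustrative check of the arithmetic: convolving an input of length N
  // with a kernel of length K in "full" mode yields N + K - 1 outputs, so
  // with N = 3 and K = 3 the input is padded with K - 1 = 2 entries on each
  // side and the result below has length 5.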
+ padding[0] = std::make_pair(2, 2); + Tensor full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor input(13); + Tensor kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array dims{{0}}; + const Eigen::array stride_of_3{{3}}; + const Eigen::array stride_of_2{{2}}; + + Tensor result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + void test_cxx11_tensor_convolution() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index f331cb481..26465ee11 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context) context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } +template +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + template static void test_contraction(Context* context) { @@ -197,6 +205,15 @@ static void test_cpu() { } } + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { @@ -299,6 +316,16 @@ static void test_gpu() { } } + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); for (int i = 0; i < 40; ++i) { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 2a6a97856..fd1b1fa32 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::IndexPair; static void test_simple_reshape() { @@ -52,7 +53,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -125,7 +126,7 @@ static void 
test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + array, 1> contract_along{{IndexPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..b5044b962 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,64 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex(i, -i); + data2(i) = std::complex(i, 7 * i); + } + + Tensor, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex(2*i, 6*i)); + } +} + + +static void test_contractions() +{ + Tensor, 4> t_left(30, 50, 8, 31); + Tensor, 5> t_right(8, 31, 7, 20, 10); + Tensor, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e02d8e4be..f0de61f8b 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,22 +9,23 @@ #define EIGEN_USE_THREADS - +#include #include "main.h" #include + using Eigen::Tensor; -void test_cxx11_tensor_thread_pool() +static void test_multithread_elementwise() { - Eigen::Tensor in1(2,3,7); - Eigen::Tensor in2(2,3,7); - Eigen::Tensor out(2,3,7); + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { @@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool() } } } + + +static void test_multithread_compound_assignment() +{ + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void 
test_multithread_contraction() +{ + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + MatrixXf m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_contraction_corner_cases() +{ + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims{{DimPair(0, 0)}}; + + typedef Map MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + MatrixXf m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + 
t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+}
+
+
+static void test_multithread_contraction_agrees_with_singlethread() {
+  int contract_size = internal::random<int>(1, 5000);
+
+  Tensor<float, 3> left(internal::random<int>(1, 80),
+                        contract_size,
+                        internal::random<int>(1, 100));
+
+  Tensor<float, 4> right(internal::random<int>(1, 25),
+                         internal::random<int>(1, 37),
+                         contract_size,
+                         internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11));
+
+  Tensor<float, 5> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test will fail
+    // due to numerical precision issues when values are small)
+    if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+
+static void test_memcpy() {
+
+  for (int i = 0; i < 5; ++i) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPoolDevice thread_pool_device(num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
+    for (int i = 0; i < size; i++) {
+      VERIFY_IS_EQUAL(t1(i), result[i]);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_thread_pool()
+{
+  CALL_SUBTEST(test_multithread_elementwise());
+  CALL_SUBTEST(test_multithread_compound_assignment());
+
+  CALL_SUBTEST(test_multithread_contraction());
+
+  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread());
+
+  // Exercise various cases that have been problematic in the past.
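  // Illustrative note: the corner cases above shrink one or both of the free
  // dimensions of the contraction down to 1, degenerating it into
  // matrix*vector and vector*vector shapes that stress the block-size and
  // alignment logic differently from the large rectangular case.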
+ CALL_SUBTEST(test_contraction_corner_cases()); + + CALL_SUBTEST(test_memcpy()); +} -- cgit v1.2.3 From dba55041ab62961e549ea58778dffa3eaa0cbdb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 11:20:36 -0700 Subject: Added support for promises Started to improve multithreaded contractions --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorContractionThreadPool.h | 351 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 11 + 3 files changed, 363 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 2137f4276..7ec60044e 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,6 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 000000000..dc0513305 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
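// Illustrative sketch of the synchronization pattern this file relies on
// (assuming the Promise type introduced alongside it is std::promise<void>):
//
//   Promise p;
//   std::future<void> f = p.get_future();
//   // worker thread:    ... do work ...  p.set_value();   // signal done
//   // consumer thread:  f.wait();                         // block until done
//
// The evaluator below keeps one such promise per (blockA buffer, n_block)
// pair so that a packing task never overwrites a buffer that a kernel might
// still be reading.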
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template +struct packRhsAndKernelArg { + const std::vector* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + std::vector* kernel_promises; + const std::vector* lhs_futures; + const bool need_to_pack; +}; + +} // end namespace internal + + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs LhsPacker; + typedef internal::gemm_pack_rhs RhsPacker; + + // TODO: replace false, false with conjugate values? 
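// Illustrative note: the two booleans the TODO above refers to are the
// ConjugateLhs / ConjugateRhs parameters of gebp_kernel; passing false, false
// multiplies complex operands without conjugation, which is only correct as
// long as the contraction never requests conjugated arguments.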
+ typedef internal::gebp_kernel GebpKernel; + + typedef internal::packLhsArg packLArg; + typedef internal::packRhsAndKernelArg packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::computeProductBlockingSizes(kc, mc, nc/*, num_threads*/); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + std::vector blockAs; + blockAs.reserve(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
con: overhead + std::vector blockBs; + blockBs.reserve(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_futures starts with all null futures + std::vector lhs_futures(num_threads); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_promises = num_threads * n_blocks; + Promise p; + p.set_value(); + std::vector kernel_promises(num_kernel_promises, p); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + for (int i = 0; i < n_blocks; ++i) { + int future_id = (blockAId * n_blocks + i); + wait_until_ready(&kernel_promises[future_id]); + kernel_promises[future_id] = Promise(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + lhs_futures[blockAId] = + this->m_device.enqueue(&Self::packLhs, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + int future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(&kernel_promises[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_promises, // kernel_promises + &lhs_futures, // lhs_futures + need_to_pack, // need_to_pack + }; + + typedef decltype(Self::packRhsAndKernel) Func; + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + } + } + } + + // collect the last frame of kernel futures + for (int i = 0; i < kernel_promises.size(); ++i) { + wait_until_ready(&kernel_promises[i]); + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. 
+ */ + template + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + + wait_until_ready(&(*arg.lhs_futures)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + eigen_assert(!(*arg.kernel_promises)[set_idx].ready()); + (*arg.kernel_promises)[set_idx].set_value(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 5a6ff70e9..3748879cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -39,6 +39,17 @@ struct DefaultDevice { #ifdef EIGEN_USE_THREADS typedef std::future Future; +typedef std::promise Promise; + +static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { + f->wait(); + // eigen_assert(f->ready()); +} + +static EIGEN_STRONG_INLINE void wait_until_ready(Promise* p) { + p->get_future().wait(); + // eigen_assert(p->get_future().ready()); +} struct ThreadPoolDevice { ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } -- cgit v1.2.3 From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124 ++++++++++++++------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +-- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 +-- 13 files changed, 117 insertions(+), 79 deletions(-) diff --git a/Eigen/src/Core/SolveTriangular.h 
b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. 
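To make the cache constraints concrete: with the default l1 = 32KB installed above and illustrative micro-kernel sizes (mr = 8, nr = 4; the real values come from gebp_traits), the k bound computed in the code below works out to about 680 for floats:

#include <cstdio>

int main() {
  // Illustrative numbers only; mr/nr and KcFactor depend on the instantiation.
  const long l1 = 32 * 1024;                // default L1 size set by this patch
  const long mr = 8, nr = 4, s = sizeof(float);
  const long kdiv = mr * s + nr * s;        // packed bytes consumed per unit of k
  const long ksub = mr * nr * s;            // footprint of one accumulator tile
  const long k_cache = (l1 - ksub) / kdiv;  // (32768 - 128) / 48 = 680
  std::printf("k capped at %ld, rounded down to a multiple of 8\n", k_cache & ~7L);
  return 0;
}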
- std::ptrdiff_t l1, l2; - - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
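In the multi-threaded branch added by this patch, each thread first gets its own slice of the output columns, which is then capped so that a packed rhs panel still fits in the L2 cache. The per-thread arithmetic in isolation (a sketch; nr = 4 is illustrative and assumed to be a power of two):

#include <cstdio>

int main() {
  const long n = 1000, num_threads = 12, nr = 4;
  const long n_per_thread = (n + num_threads - 1) / num_threads;  // CEIL(n, num_threads) = 84
  const long n_rounded = (n_per_thread + nr - 1) & ~(nr - 1);     // round up to a multiple of nr
  std::printf("each thread handles up to %ld of %ld columns\n", n_rounded, n);
  return 0;
}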
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 16 Oct 2014 10:10:04 -0700 Subject: Avoid calling get_future() more than once on a given promise. --- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 13 ++++++++----- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h | 5 ----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 8e4c7c11d..cf1352a31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -198,8 +198,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises); + std::vector kernel_futures(num_kernel_promises); for (int i = 0; i < kernel_promises.size(); ++i) { kernel_promises[i].set_value(); + kernel_futures[i] = kernel_promises[i].get_future(); } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { @@ -218,8 +220,9 @@ struct TensorEvaluatorready()); } -static EIGEN_STRONG_INLINE void wait_until_ready(Promise* p) { - p->get_future().wait(); - // eigen_assert(p->get_future().ready()); -} - struct ThreadPoolDevice { ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } -- cgit v1.2.3 From 94e47798f4e462b857a00b4ca60c954c71d16605 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 10:41:07 -0700 Subject: Fixed the return types of unary and binary expressions to properly handle the case where it is different from the input type (e.g. 
abs(complex)) --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 16 ++++++++-------- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 7 ++----- unsupported/test/cxx11_tensor_of_complex.cpp | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e324ba8d2..131326615 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -155,8 +155,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -203,8 +203,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -257,8 +257,8 @@ struct TensorEvaluator::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -317,8 +317,8 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index de66da13f..6e5503de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -84,9 +84,7 @@ struct traits > typedef typename result_of< UnaryOp(typename XprType::Scalar) >::type Scalar; - typedef typename result_of< - UnaryOp(typename XprType::Packet) - >::type Packet; + typedef typename internal::packet_traits::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -188,8 +186,7 @@ class TensorCwiseBinaryOp : public TensorBase::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index b5044b962..24b2bcb58 100644 --- 
a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -32,6 +32,22 @@ static void test_additions() } +static void test_abs() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + data1.setRandom(); + data2.setRandom(); + + Tensor abs1 = data1.abs(); + Tensor abs2 = data2.abs(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(abs1(i), std::abs(data1(i))); + VERIFY_IS_APPROX(abs2(i), std::abs(data2(i))); + } +} + + static void test_contractions() { Tensor, 4> t_left(30, 50, 8, 31); @@ -60,5 +76,6 @@ static void test_contractions() void test_cxx11_tensor_of_complex() { CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_abs()); CALL_SUBTEST(test_contractions()); } -- cgit v1.2.3 From ae697b471c0d3961ebdb633e30046e5fe31fbe24 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 14:52:50 -0700 Subject: Silenced a few compilation warnings Generalized a TensorMap constructor --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 10 +++++----- 13 files changed, 24 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 879057f38..ceed09505 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
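The test_abs case above is the motivating example for the return-type fix: abs() applied to a complex tensor must yield a real tensor, which is why CoeffReturnType is now taken from the functor's traits rather than from the input expression. In use (a sketch, assuming Eigen's source root is on the include path):

#include <unsupported/Eigen/CXX11/Tensor>
#include <complex>

int main() {
  Eigen::Tensor<std::complex<float>, 1> c(3);
  c.setRandom();
  Eigen::Tensor<float, 1> a = c.abs();  // abs(complex<float>) produces float, not complex<float>
  return 0;
}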
// +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -82,7 +83,7 @@ class Tensor : public TensorBase > static const std::size_t NumIndices = NumIndices_; - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0e55d4de1..2bd158dac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 9ecea9108..3aa3eba24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index b8e43f484..74485b15b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -140,7 +140,7 @@ struct TensorEvaluator m_outputStrides; array m_leftStrides; array m_rightStrides; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Axis m_axis; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cd992daab..0db34adb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -671,10 +671,10 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; - const Device& m_device; - Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 34bdd5309..50cb10a33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -230,7 +230,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 2c0d2cd0f..0a8c10ac7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -64,7 +64,8 @@ template class TensorMap : public Tensor } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + template + inline TensorMap(PointerArgType dataPtr, const Dimensions& 
dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 13109f514..686bf5c24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -130,8 +130,8 @@ struct TensorEvaluator, Device> Scalar* data() const { return m_impl.data(); } protected: - NewDimensions m_dimensions; TensorEvaluator m_impl; + NewDimensions m_dimensions; }; @@ -381,13 +381,13 @@ struct TensorEvaluator, Devi return inputIndex; } - Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; - const StartIndices m_offsets; TensorEvaluator m_impl; const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 8da6e0f26..89c0cff05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -215,11 +215,11 @@ struct TensorEvaluator, Device return rslt; } - PaddingDimensions m_padding; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; + PaddingDimensions m_padding; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 01f2daf52..e2fe32d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eef992106..cbe87394b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -152,7 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -217,8 +217,8 @@ struct TensorEvaluator, Device> array m_preservedStrides; array m_reducedStrides; array m_reducedDims; - Op m_reducer; TensorEvaluator m_impl; + Op m_reducer; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 7e0063626..831a9f005 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -131,7 +131,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp 
b/unsupported/test/cxx11_tensor_fixed_size.cpp index b0501aaa3..99ffc7f07 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,10 +32,10 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, 6); + TensorMap > > vec3(data3, Sizes<6>()); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, 6); + TensorMap, RowMajor> > vec4(data4, Sizes<6>()); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); @@ -68,9 +68,9 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1,2,3); + TensorMap >> mat1(data1, Sizes<2, 3>()); float data2[6]; - TensorMap, RowMajor>> mat2(data2,2,3); + TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); VERIFY_IS_EQUAL((mat1.size()), 2*3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); @@ -166,7 +166,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array(i,j,k)) = val; + mat1(array{{i,j,k}}) = val; val += 1.0; } } -- cgit v1.2.3 From 65af852b54afca3c76c978c1bfd27d8a1451cab6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 15:02:30 -0700 Subject: Silenced one last warning --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0db34adb1..c530b27a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int i) { } + EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { -- cgit v1.2.3 From 7acd38d19e2f9559825c78b4be8644f3b10496fb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 17 Oct 2014 09:49:03 -0700 Subject: Created some benchmarks for the tensor code --- bench/btl/CMakeLists.txt | 1 + bench/btl/libs/tensors/CMakeLists.txt | 44 ++++++++++++ bench/btl/libs/tensors/main_linear.cpp | 23 +++++++ bench/btl/libs/tensors/main_matmat.cpp | 21 ++++++ bench/btl/libs/tensors/main_vecmat.cpp | 21 ++++++ bench/btl/libs/tensors/tensor_interface.hh | 105 +++++++++++++++++++++++++++++ unsupported/Eigen/CXX11/Core | 2 + 7 files changed, 217 insertions(+) create mode 100644 bench/btl/libs/tensors/CMakeLists.txt create mode 100644 bench/btl/libs/tensors/main_linear.cpp create mode 100644 bench/btl/libs/tensors/main_matmat.cpp create mode 100644 bench/btl/libs/tensors/main_vecmat.cpp create mode 100644 bench/btl/libs/tensors/tensor_interface.hh diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index b299d9899..9444b450c 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -97,6 +97,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 000000000..09d6d8e43 --- /dev/null +++ b/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR 
${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 000000000..e257f1e72 --- /dev/null +++ b/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 000000000..675fcfc6d --- /dev/null +++ b/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
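The drivers above and the tensor_interface.hh below map each BTL kernel onto tensor expressions: matrix-matrix and matrix-vector products become contract() over one dimension pair, and axpy/axpby are written with constant() because no scalar*tensor overload is used. A standalone sketch of those mappings (dimensions are illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  Eigen::Tensor<float, 2> A(64, 64), B(64, 64), X(64, 64);
  A.setRandom(); B.setRandom();

  // Contract the column index of A with the row index of B: a matrix product.
  const Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
  X = A.contract(B, dims);

  // axpy spelled with constant(): Y += coef * X.
  Eigen::Tensor<float, 1> x(64), y(64);
  x.setRandom(); y.setRandom();
  y += x.constant(3.14f) * x;
  return 0;
}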
+// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 000000000..1af00c81b --- /dev/null +++ b/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 000000000..97b8e0f0b --- /dev/null +++ b/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include +#include +#include "btl.hh" + +using namespace Eigen; + +template +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor::Index Index; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef Eigen::Tensor gene_matrix; + typedef Eigen::Tensor gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ 
+    cible = source;
+  }
+};
+
+#endif
diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core
index f6c3b49bb..292f09564 100644
--- a/unsupported/Eigen/CXX11/Core
+++ b/unsupported/Eigen/CXX11/Core
@@ -30,6 +30,8 @@
  * \endcode
  */

+#include
+
 // Emulate the cxx11 functionality that we need if the compiler doesn't support it.
 #if __cplusplus <= 199711L
 #include "src/Core/util/EmulateCXX11Meta.h"
--
cgit v1.2.3

From f786897e4b96737767effc85bedb78f06dc46dc5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 17 Oct 2014 15:33:27 -0700
Subject: Added access to the underlying raw data of a tensor slice/chip
 whenever possible
---
 .../Eigen/CXX11/src/Tensor/TensorChipping.h |  9 ++-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 21 ++++++-
 unsupported/test/cxx11_tensor_chipping.cpp  | 37 +++++++++++++
 unsupported/test/cxx11_tensor_morphing.cpp  | 64 +++++++++++++++++++++-
 4 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 3aa3eba24..b862a8fd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -157,7 +157,14 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   }*/

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    Scalar* result = m_impl.data();
+    if (DimId == NumDims && result) {
+      return result + m_inputOffset;
+    } else {
+      return NULL;
+    }
+  }

 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 686bf5c24..3447592eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -366,7 +366,26 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
   }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    Scalar* result = m_impl.data();
+    if (result) {
+      Index offset = 0;
+      for (int i = 0; i < NumDims; ++i) {
+        if (m_dimensions[i] != m_impl.dimensions()[i]) {
+          offset += m_offsets[i] * m_inputStrides[i];
+          for (int j = i+1; j < NumDims; ++j) {
+            if (m_dimensions[j] > 1) {
+              return NULL;
+            }
+            offset += m_offsets[j] * m_inputStrides[j];
+          }
+          break;
+        }
+      }
+      return result + offset;
+    }
+    return NULL;
+  }

 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const

diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 8c8a0cec2..0027b2888 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -236,9 +236,46 @@ static void test_chip_as_lvalue()
 }

+static void test_chip_raw_data()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
+  auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
+        }
+      }
+    }
+  }
+
+  typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
+  auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)),
DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast(0)); +} + + void test_cxx11_tensor_chipping() { CALL_SUBTEST(test_simple_chip()); CALL_SUBTEST(test_chip_in_expr()); CALL_SUBTEST(test_chip_as_lvalue()); + CALL_SUBTEST(test_chip_raw_data()); } diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index fd1b1fa32..78b0dade0 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,7 +12,6 @@ #include using Eigen::Tensor; -using Eigen::IndexPair; static void test_simple_reshape() { @@ -53,7 +52,8 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -126,7 +126,8 @@ static void test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array, 1> contract_along{{IndexPair(1, 0)}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); @@ -190,6 +191,62 @@ static void test_slice_as_lvalue() } +static void test_slice_raw_data() +{ + Tensor tensor(3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes offsets(1,2,3,4); + Eigen::DSizes extents(1,1,1,1); + typedef TensorEvaluator SliceEvaluator; + auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); + VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); + + extents = Eigen::DSizes(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + + extents = Eigen::DSizes(1,2,1,1); + auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice3.data(), static_cast(0)); + + offsets = Eigen::DSizes(0,2,3,4); + extents = Eigen::DSizes(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + + offsets = Eigen::DSizes(0,0,0,4); + extents = Eigen::DSizes(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } + } + 
} + } + + offsets = Eigen::DSizes(0,0,0,0); + extents = Eigen::DSizes(3,5,7,11); + auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3ul*5*7*11); + VERIFY_IS_EQUAL(slice6.data(), tensor.data()); +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); @@ -199,4 +256,5 @@ void test_cxx11_tensor_morphing() CALL_SUBTEST(test_simple_slice()); CALL_SUBTEST(test_slice_in_expr()); CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_raw_data()); } -- cgit v1.2.3 From debc97821c775518afd54e05e19dec9eb0c3bde1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Oct 2014 23:10:13 -0700 Subject: Added support for tensor references --- unsupported/Eigen/CXX11/Tensor | 2 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 360 +++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 40 +++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_ref.cpp | 192 +++++++++++ 6 files changed, 596 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorRef.h create mode 100644 unsupported/test/cxx11_tensor_ref.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 47447f446..c36db96ec 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -76,6 +76,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 67f478822..a72e11215 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,6 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; +template class TensorRef; template::value> class TensorBase; template class TensorCwiseNullaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 000000000..db2027a5f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,360 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
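TensorRef, introduced in this new header, shares a single lazily-built evaluator among all copies of a ref through an intrusive reference count (incrRefCount/decrRefCount below). The lifecycle in miniature (names and the delete-on-zero policy are illustrative, not Eigen's exact code):

// Miniature intrusive reference counting in the style of TensorLazyBaseEvaluator.
class SharedEvaluator {
  int m_refcount = 0;
 public:
  void incrRefCount() { ++m_refcount; }
  void decrRefCount() { --m_refcount; }
  int refCount() const { return m_refcount; }
};

class Ref {
  SharedEvaluator* m_evaluator;
 public:
  explicit Ref(SharedEvaluator* e) : m_evaluator(e) { m_evaluator->incrRefCount(); }
  Ref(const Ref& other) : m_evaluator(other.m_evaluator) { m_evaluator->incrRefCount(); }
  Ref& operator=(const Ref&) = delete;  // kept minimal; the real class supports assignment
  ~Ref() {
    m_evaluator->decrRefCount();
    if (m_evaluator->refCount() == 0) delete m_evaluator;  // last ref frees the evaluator
  }
};

int main() {
  Ref a(new SharedEvaluator());
  { Ref b(a); }  // copying bumps the count; leaving scope drops it again
  return 0;
}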
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename Scalar>
+class TensorLazyBaseEvaluator {
+ public:
+  TensorLazyBaseEvaluator() : m_refcount(0) { }
+  virtual ~TensorLazyBaseEvaluator() { }
+
+  virtual const Dimensions& dimensions() const = 0;
+  virtual const Scalar* data() const = 0;
+
+  virtual const Scalar coeff(DenseIndex index) const = 0;
+  virtual Scalar& coeffRef(DenseIndex index) = 0;
+
+  void incrRefCount() { ++m_refcount; }
+  void decrRefCount() { --m_refcount; }
+  int refCount() const { return m_refcount; }
+
+ private:
+  // No copy, no assignment;
+  TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
+  TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
+
+  int m_refcount;
+};
+
+static char dummy[8];
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
+ public:
+  //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+
+  TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) {
+    m_dims = m_impl.dimensions();
+    m_impl.evalSubExprsIfNeeded(NULL);
+  }
+  virtual ~TensorLazyEvaluatorReadOnly() {
+    m_impl.cleanup();
+  }
+
+  virtual const Dimensions& dimensions() const {
+    return m_dims;
+  }
+  virtual const Scalar* data() const {
+    return m_impl.data();
+  }
+
+  virtual const Scalar coeff(DenseIndex index) const {
+    return m_impl.coeff(index);
+  }
+  virtual Scalar& coeffRef(DenseIndex index) {
+    eigen_assert(false && "can't reference the coefficient of a rvalue");
+    return *reinterpret_cast<Scalar*>(dummy);
+  };
+
+ protected:
+  TensorEvaluator<Expr, Device> m_impl;
+  Dimensions m_dims;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
+ public:
+  typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
+  typedef typename Base::Scalar Scalar;
+
+  TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
+  }
+  virtual ~TensorLazyEvaluatorWritable() {
+  }
+
+  virtual Scalar& coeffRef(DenseIndex index) {
+    return this->m_impl.coeffRef(index);
+  }
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
+                            TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                            TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
+ public:
+  typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
+                                         TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                                         TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
+  typedef typename Base::Scalar Scalar;
+
+  TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
+  }
+  virtual ~TensorLazyEvaluator() {
+  }
+};
+
+}  // namespace internal
+
+
+/** \class TensorRef
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief A reference to a tensor expression
+  * The expression will be evaluated lazily (as much as possible).
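Two implementation details above deserve a caution. The reference count is a plain int, so TensorRef copies are not thread-safe without external synchronization. And coeffRef() on a read-only (rvalue) evaluator only fires eigen_assert before returning a reference into the static dummy buffer, so in release builds, where the assert compiles away, such a write is silently dropped. The ownership discipline, restated as a comment sketch rather than patch code:

// Ownership contract of the shared lazy evaluator (sketch):
//   constructing or copying a TensorRef   -> incrRefCount()
//   destroying or reassigning a TensorRef -> decrRefCount(), delete at zero
// The count is neither atomic nor mutex-protected: guard any cross-thread
// sharing of TensorRef objects externally.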
+ * + */ +template class TensorRef : public TensorBase > +{ + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... 
otherIndices) const + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } +#endif + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(const array& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options&RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator* m_evaluator; +}; + + +// evaluator for rvalues +template +struct TensorEvaluator, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef m_ref; +}; + + +// evaluator for lvalues +template +struct TensorEvaluator, Device> : public TensorEvaluator, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename 
Derived::Dimensions Dimensions; + + typedef TensorEvaluator, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 5940a8cf1..5c0f78489 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -84,6 +84,20 @@ struct traits > }; }; +template +struct traits > + : public traits +{ + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + enum { + Options = BaseTraits::Options, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; +}; + template struct eval, Eigen::Dense> @@ -121,6 +135,19 @@ struct eval, Eigen::Dense> typedef const TensorMap& type; }; +template +struct eval, Eigen::Dense> +{ + typedef const TensorRef& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorRef& type; +}; + + template struct nested, 1, typename eval >::type> { @@ -145,6 +172,7 @@ struct nested, 1, typename e typedef const TensorFixedSize& type; }; + template struct nested, 1, typename eval >::type> { @@ -157,6 +185,18 @@ struct nested, 1, typename eval& type; }; +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorRef& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorRef& type; +}; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index a7ef2b402..2b5395013 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -126,5 +126,6 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") + ei_add_test(cxx11_tensor_ref "-std=c++0x") ei_add_test(cxx11_tensor_io "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp new file mode 100644 index 000000000..4ff94a059 --- /dev/null +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
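For orientation before the tests: a hedged sketch of the behavior this commit introduces. The shapes mirror the tests below and the include path assumes the unsupported test setup; treat this as an illustration of the lazy semantics, not as additional documented API.

#include <Eigen/CXX11/Tensor>

void tensor_ref_sketch() {
  Eigen::Tensor<float, 3> a(3, 5, 7), b(3, 5, 7);
  a.setRandom();
  b.setRandom();

  // Binding an expression does not materialize it: the ref owns a lazy
  // evaluator and computes coefficients on demand.
  Eigen::TensorRef<Eigen::Tensor<float, 3> > r = a + b;
  float x = r(1, 2, 3);  // evaluates just this one coefficient
  // r.data() is not a pointer into a or b: the sum has no backing buffer.

  // Binding an lvalue shares its storage instead of copying it.
  Eigen::TensorRef<Eigen::Tensor<float, 3> > s = a;
  s.coeffRef(0) = x;     // writes through to a's buffer
}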
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_simple_lvalue_ref() +{ + Tensor input(6); + input.setRandom(); + + TensorRef> ref3(input); + TensorRef> ref4 = input; + + VERIFY_IS_EQUAL(ref3.data(), input.data()); + VERIFY_IS_EQUAL(ref4.data(), input.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input(i)); + VERIFY_IS_EQUAL(ref4(i), input(i)); + } + + for (int i = 0; i < 6; ++i) { + ref3.coeffRef(i) = i; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), i); + } + for (int i = 0; i < 6; ++i) { + ref4.coeffRef(i) = -i * 2; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), -i*2); + } +} + + +static void test_simple_rvalue_ref() +{ + Tensor input1(6); + input1.setRandom(); + Tensor input2(6); + input2.setRandom(); + + TensorRef> ref3(input1 + input2); + TensorRef> ref4 = input1 + input2; + + VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i)); + VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i)); + } +} + + +static void test_multiple_dims() +{ + Tensor input(3,5,7); + input.setRandom(); + + TensorRef> ref(input); + VERIFY_IS_EQUAL(ref.data(), input.data()); + VERIFY_IS_EQUAL(ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_slice() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); + TensorRef> slice = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5)); + + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); + slice = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + + Eigen::DSizes indices3(0,0,0,0,0); + Eigen::DSizes sizes3(2,3,1,1,1); + slice = tensor.slice(indices3, sizes3); + VERIFY_IS_EQUAL(slice.data(), tensor.data()); +} + + +static void test_ref_of_ref() +{ + Tensor input(3,5,7); + input.setRandom(); + + TensorRef> ref(input); + TensorRef> ref_of_ref(ref); + TensorRef> ref_of_ref2; + ref_of_ref2 = ref; + + VERIFY_IS_EQUAL(ref_of_ref.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7); + + VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k)); + VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_ref_in_expr() +{ + Tensor input(3,5,7); + input.setRandom(); + TensorRef> input_ref(input); + + Tensor result(3,5,7); + result.setRandom(); + TensorRef> result_ref(result); + + Tensor bias(3,5,7); + bias.setRandom(); + + result_ref = input_ref + bias; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; 
k < 7; ++k) { + VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k)); + VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } + + result = result_ref; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_ref() +{ + CALL_SUBTEST(test_simple_lvalue_ref()); + CALL_SUBTEST(test_simple_rvalue_ref()); + CALL_SUBTEST(test_multiple_dims()); + CALL_SUBTEST(test_slice()); + CALL_SUBTEST(test_ref_of_ref()); + CALL_SUBTEST(test_ref_in_expr()); +} -- cgit v1.2.3 From 5e62427e22002019d1a3ef05daeb75c6db7c6405 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 17:49:39 -0700 Subject: Use the proper index type --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 01fa04c64..4fa8e83ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -149,26 +149,26 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template +template __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, unsigned int size) { + EigenMetaKernel(Evaluator eval, Index size) { - const int first_index = blockIdx.x * blockDim.x + threadIdx.x; - const int step_size = blockDim.x * gridDim.x; + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { // Use the scalar path - for (int i = first_index; i < size; i += step_size) { + for (Index i = first_index; i < size; i += step_size) { eval.evalScalar(i); } } else { // Use the vector path - const int PacketSize = unpacket_traits::size; - const int vectorized_step_size = step_size * PacketSize; - const int vectorized_size = (size / PacketSize) * PacketSize; - int i = first_index * PacketSize; + const Index PacketSize = unpacket_traits::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + Index i = first_index * PacketSize; for ( ; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } @@ -193,7 +193,7 @@ class TensorExecutor const int block_size = maxCudaThreadsPerBlock(); const Index size = array_prod(evaluator.dimensions()); - EigenMetaKernel > <<>>(evaluator, size); + EigenMetaKernel, Index><<>>(evaluator, size); assert(cudaGetLastError() == cudaSuccess); } evaluator.cleanup(); -- cgit v1.2.3 From 1946cc44784c9d0b024a2f1d7d7664010735411f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 17:52:32 -0700 Subject: Added missing packet primitives for CUDA. 
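For context on the executor change just above: EigenMetaKernel uses the standard CUDA grid-stride loop, and with the previous 32-bit loop variables the index arithmetic overflows once a tensor holds on the order of 2^31 coefficients, hence the new Index template parameter. A minimal sketch of the idiom under those assumptions (Op is a hypothetical per-coefficient functor, not part of the patch):

// Grid-stride loop over `size` coefficients with a caller-chosen index type.
template <typename Index, typename Op>
__global__ void grid_stride_kernel(Op op, Index size) {
  const Index first = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step = blockDim.x * gridDim.x;  // total threads in the grid
  for (Index i = first; i < size; i += step) {
    op(i);  // each thread handles every step-th coefficient
  }
}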
--- Eigen/src/Core/arch/CUDA/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 5b0abe2e6..7b481d512 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, c to[stride*1] = from.y; } +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; -- cgit v1.2.3 From bc99c5f7db8d4d7e41e5e4358170e99a1bf9d364 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:09:53 -0700 Subject: fixed some potential alignment issues. --- Eigen/src/Core/util/Macros.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8fdd7d898..001907a0b 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -297,7 +297,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) +#if (defined __CUDACC__) +#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3447592eb..33849ed3e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -459,7 +459,7 @@ struct TensorEvaluator, Device> this->m_impl.template writePacket(inputIndices[0], x); } else { - CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; -- cgit v1.2.3 From d62bfe73a92878c878a6b46674a2ea4cec130ac8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:15:05 -0700 Subject: Use the proper index type in the padding code --- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 89c0cff05..d6347b054 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -98,7 +98,6 @@ struct TensorEvaluator, Device for (int i = 0; i < NumDims; ++i) { m_dimensions[i] += m_padding[i].first + m_padding[i].second; } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); m_inputStrides[0] = 1; 
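// (Aside on the TensorMorphing.h hunk above: internal::pstore performs an
//  aligned store, so the stack temporary it writes must itself be aligned
//  to the packet size. EIGEN_ALIGN_DEFAULT requests that alignment; without
//  it the store can fault on targets whose default stack alignment is
//  smaller than the packet, e.g. 32-byte AVX packets.)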
m_outputStrides[0] = 1; @@ -125,6 +124,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; @@ -151,11 +151,11 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const int first = index; - const int last = index + packetSize - 1; - const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const int lastPaddedRight = m_outputStrides[i+1]; + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. @@ -179,9 +179,9 @@ struct TensorEvaluator, Device const Index last = index + packetSize - 1; const Index first = index; - const int lastPaddedLeft = m_padding[0].first; - const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const int lastPaddedRight = m_outputStrides[1]; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. -- cgit v1.2.3 From fcecafde3aac795a50c32dc5c91a0ed59b4819ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 21:58:14 -0700 Subject: Fixed a compilation error with clang --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a753c5a48..1af2d7bcd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -187,13 +187,6 @@ class TensorFixedSize : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) -- cgit v1.2.3 From 85c3389b2845c5bece37dfb155053aef22ea4138 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 00:04:13 -0700 Subject: Fixed a test --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 2 +- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 2dd8e274b..c5965065e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -384,6 +384,9 @@ static const size_t value = Sizes::count; }; template struct array_size > { static const size_t value = Sizes::count; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { + return get::Base>::value; }; #else template struct array_size > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index db2027a5f..d43fb286e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -64,7 +64,7 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator(dummy); }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 2b5395013..49a8013ea 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -100,7 +100,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") -# ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_assign "-std=c++0x") # ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") -- cgit v1.2.3 From 7f2c6ed2fa35d7f83f0da83c8564b7bd5b01d232 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 11:45:21 -0700 Subject: Fixed a compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index c5965065e..3d646c455 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -387,7 +387,7 @@ static const size_t value = Sizes::count; }; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { return get::Base>::value; -}; +} #else template struct array_size > { static const size_t value = Sizes::count; -- cgit v1.2.3 From 2dde63499c4ef836a0d9dfd443494d863ad62b16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 16:33:51 -0700 Subject: Generalized the matrix vector product code. 
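The central idea of this patch, visible throughout the diffs below: general_matrix_vector_product no longer takes raw (pointer, stride) pairs but LhsMapper/RhsMapper objects that answer scalar and packet loads, report their stride, and locate the first aligned element. Anything implementing that small surface can feed the kernel, which is what later allows the tensor contraction code in this same series to route one-dimensional contractions through GEMV without first copying operands into dense buffers. A reduced sketch of the surface, hypothetical and far smaller than the real blas_data_mapper:

// Minimal column-major mapper (sketch): roughly what the generalized GEMV
// consumes. Real mappers also expose packet loads and a vector-mapper view.
template <typename Scalar, typename Index>
class simple_mapper {
 public:
  simple_mapper(const Scalar* data, Index stride)
      : m_data(data), m_stride(stride) {}

  // Scalar access: element (i, j) of a column-major matrix.
  Scalar operator()(Index i, Index j) const {
    return m_data[i + j * m_stride];
  }

  Index stride() const { return m_stride; }

  // First index usable for aligned packet loads; by convention a negative
  // result means the base pointer is not even scalar-aligned, which makes
  // the kernel fall back to its unvectorized path.
  Index firstAligned(Index /*size*/) const { return 0; }

 private:
  const Scalar* m_data;
  Index m_stride;
};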
--- Eigen/src/Core/GeneralProduct.h | 32 +-- Eigen/src/Core/products/GeneralMatrixVector.h | 246 ++++++++++++----------- Eigen/src/Core/products/TriangularMatrixVector.h | 46 +++-- Eigen/src/Core/products/TriangularSolverVector.h | 24 ++- Eigen/src/Core/util/BlasUtil.h | 47 ++++- 5 files changed, 228 insertions(+), 167 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 7179eb124..9d3d5562c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { /** \class GeneralProduct * \ingroup Core_Module @@ -257,7 +257,7 @@ class GeneralProduct : public ProductBase, Lhs, Rhs> { template struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; - + public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) @@ -266,7 +266,7 @@ class GeneralProduct EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } - + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; @@ -277,12 +277,12 @@ class GeneralProduct dst.const_cast_derived() += m_scale * src; } }; - + template inline void evalTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, set(), IsRowMajor()); } - + template inline void addTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, add(), IsRowMajor()); @@ -436,12 +436,12 @@ template<> struct gemv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? 
dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -457,11 +457,13 @@ template<> struct gemv_selector MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -516,11 +518,13 @@ template<> struct gemv_selector Map(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -594,7 +598,7 @@ MatrixBase::operator*(const MatrixBase &other) const #ifdef EIGEN_DEBUG_PRODUCT internal::product_type::debug(); #endif - + return Product(derived(), other.derived()); } #else diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7dfa48bfb 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning apply for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary. 
* */ -template -struct general_matrix_vector_product +template +struct general_matrix_vector_product { -typedef typename scalar_product_traits::ReturnType ResScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; enum { Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper cj; conj_helper pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +182,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +203,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -211,11 +215,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload(&lhs0[j]); - A10 = 
pload(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +258,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j(alignedStart)) for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); else for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i -struct general_matrix_vector_product +template +struct general_matrix_vector_product { typedef typename scalar_product_traits::ReturnType ResScalar; @@ -346,67 +350,69 @@ typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } conj_helper cj; conj_helper pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, 
FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (lhsAlignmentOffset < 0) || (rhsAlignmentOffset < 0) ) { alignedSize = 0; alignedStart = 0; @@ -418,7 +424,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +453,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -481,11 +487,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&rhs[j]); - A11 = pload(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = 
pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; jalignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned(alignedStart)) for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); else for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); - + typedef Map, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -184,7 +190,7 @@ struct TriangularProduct template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); } }; @@ -211,7 +217,7 @@ struct TriangularProduct namespace internal { // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. - + template<> struct trmv_selector { template @@ -247,7 +253,7 @@ template<> struct trmv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -267,7 +273,7 @@ template<> struct trmv_selector else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product struct trmv_selector #endif Map(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product ::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template struct triangular_solve_vector @@ -37,6 +37,10 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; typename internal::conditional,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 25a62d528..c4881b8da 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -34,7 +34,9 @@ 
template< int ResStorageOrder> struct general_matrix_matrix_product; -template +template struct general_matrix_vector_product; @@ -118,13 +120,35 @@ template struct get_factor::R }; +template +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt(m_data + i); + } + + template + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + template -class MatrixLinearMapper { +class BlasLinearMapper { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -157,7 +181,8 @@ class blas_data_mapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - typedef MatrixLinearMapper LinearMapper; + typedef BlasLinearMapper LinearMapper; + typedef BlasVectorMapper VectorMapper; EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} @@ -170,6 +195,11 @@ class blas_data_mapper { return LinearMapper(&operator()(i, j)); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; @@ -193,6 +223,15 @@ class blas_data_mapper { return pgather(&operator()(i, j), m_stride); } + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; -- cgit v1.2.3 From b1789c112b5cf8d478a03786c6c1243320aefd47 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 3 Nov 2014 08:51:33 -0800 Subject: Improved handling of 1d tensors --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 98 +++++++++++++++++++--- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 12 ++- 2 files changed, 99 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c530b27a7..8e898619d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { @@ -142,6 +142,13 @@ class BaseTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + protected: const Tensor m_tensor; const nocontract_t m_nocontract_strides; @@ -202,6 +209,18 @@ class TensorContractionSubMapper { return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } + template + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + 
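// This templated load(), together with the aligned() member below and the
// stride()/firstAligned() members added to the base mapper above, gives the
// contraction sub-mapper the same surface that the BLAS-side vector mapper
// exposes. That shared surface is what lets general_matrix_vector_product
// consume tensor operands directly (see evalGemv further down). Only the
// unaligned path is ever claimed: aligned() returns false, and the base
// mapper's firstAligned() reports no aligned element within range.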
EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template + bool aligned(Index /*i*/) const { + return false; + } + private: const ParentMapper& m_base_mapper; const Index m_vert_offset; @@ -220,6 +239,7 @@ class TensorContractionInputMapper public: typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -233,6 +253,10 @@ class TensorContractionInputMapper return SubMapper(*this, i, j); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; @@ -306,6 +330,7 @@ class TensorContractionInputMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -319,6 +344,10 @@ class TensorContractionInputMapper::type Packet; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { @@ -592,41 +621,80 @@ struct TensorContractionEvaluatorBase if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } } + template + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + 
internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -707,7 +775,17 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index cf1352a31..f0e9bb616 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,7 +93,17 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; -- cgit v1.2.3 From 9ea09179b5394fdd4af3a8450cdb60d72b232327 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Nov 2014 10:24:42 -0800 Subject: Fixed the return type of the coefficient-wise tensor operations. --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 27 +++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 131326615..f7c784942 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -45,7 +45,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* dest) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { if (dest) { m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); return false; @@ -108,7 +108,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -161,7 +161,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -175,7 +175,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: const NullaryOp m_functor; @@ -228,7 +228,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } 
- Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: const UnaryOp m_functor; @@ -253,7 +253,9 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -313,7 +315,10 @@ struct TensorEvaluator : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) - { } + { + eigen_assert(internal::dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(internal::dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -327,7 +332,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -356,7 +361,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; -- cgit v1.2.3 From 9a06a716277029ffa152049be8fd53aee1e1bc13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 5 Nov 2014 07:49:51 -0800 Subject: Fixed a test --- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_dimension.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 49a8013ea..e83c10dc4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -101,7 +101,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") -# ei_add_test(cxx11_tensor_dimension "-std=c++0x") + ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index fc0d29c50..c806b623f 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -16,7 +16,7 @@ using Eigen::Tensor; static void test_dynamic_size() { - Eigen::DSizes dimensions(Eigen::array(2,3,7)); + Eigen::DSizes dimensions(Eigen::array{{2,3,7}}); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); @@ -37,7 +37,7 @@ static void test_fixed_size() static void test_match() { - Eigen::DSizes dyn(Eigen::array(2,3,7)); + Eigen::DSizes dyn(Eigen::array{{2,3,7}}); Eigen::Sizes<2,3,7> stat; VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true); } -- cgit v1.2.3 From cb37f818ca6e8dfc9d81343882401e3671531d1b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 5 Nov 2014 23:25:11 -0800 Subject: Fixed a compilation error triggered by some operations on fixed sized tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 12 ++++-------- unsupported/test/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3d646c455..6d9e09318 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -40,10 +40,6 @@ template struct IndexPair { // Boilerplate code namespace internal { -template struct dget { - static const std::size_t value = get::value; -}; - template struct fixed_size_tensor_index_linearization_helper @@ -53,7 +49,7 @@ struct fixed_size_tensor_index_linearization_helper const Dimensions& dimensions) { return array_get(indices) + - dget::value * + get::value * fixed_size_tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -125,7 +121,7 @@ struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes { +template struct Sizes : typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; static const size_t count = Base::count; static const std::size_t total_size = internal::arg_prod::value; @@ -164,11 +160,11 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83c10dc4..6b8ed2826 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -107,7 +107,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") -# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_complex "-std=c++0x") -- cgit v1.2.3 From c2d1074932ae92a001eadb27e9f85eaf2de187b9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Nov 2014 22:25:38 -0800 Subject: Added support for static list of indices --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 264 +++++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_index_list.cpp | 133 +++++++++++ 4 files changed, 399 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h create mode 100644 unsupported/test/cxx11_tensor_index_list.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c36db96ec..44d5a4d82 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -43,6 +43,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" 
#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 000000000..010221e74 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,264 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#if __cplusplus > 199711L + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template +void update_value(type2index& val, DenseIndex new_val) { + val.set(new_val); +} + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + +template +struct tuple_coeff { + template + static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { + return std::get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + } + template + static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + return ((i == Idx) & is_compile_time_constant >::type>::value) || + tuple_coeff::value_known_statically(i, t); + } +}; + +template <> +struct tuple_coeff<0> { + template + static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template + static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template + static constexpr bool value_known_statically(const DenseIndex i, 
const std::tuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return is_compile_time_constant >::type>::value & (i == 0); + } +}; +} // namespace internal + + +template +struct IndexList : std::tuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple& other) : std::tuple(other) { } + constexpr IndexList() : std::tuple() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); + } +}; + + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; + +template constexpr DenseIndex array_get(IndexList& a) { + return std::get(a); +} +template constexpr DenseIndex array_get(const IndexList& a) { + return std::get(a); +} + +template +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] == value; + } +}; + +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] == value; + } +}; + +template +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] != value; + } +}; + +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] != value; + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + 
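// Usage sketch for the C++11 path above (it mirrors the checks in
// cxx11_tensor_index_list.cpp added by this commit; the rank-4 float tensor
// is an assumption):
//
//   Eigen::IndexList<Eigen::type2index<0>, int, Eigen::type2index<2>, int> axes;
//   axes.set(1, 1);  // entries 0 and 2 are compile-time constants;
//   axes.set(3, 3);  // entries 1 and 3 are filled in at runtime.
//   // tensor.sum(axes) can then exploit the statically known entries.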
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 6b8ed2826..181f06fc7 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,6 +102,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") ei_add_test(cxx11_tensor_dimension "-std=c++0x") + ei_add_test(cxx11_tensor_index_list "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp new file mode 100644 index 000000000..6a103cab1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -0,0 +1,133 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + + +static void test_static_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + constexpr auto reduction_axis = make_index_list(0, 1, 2); + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 2); + + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Tensor result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + + +static void test_dynamic_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + int dim1 = 2; + int dim2 = 1; + int dim3 = 0; + + auto reduction_axis = make_index_list(dim1, dim2, dim3); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 0); + + Tensor result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + +static void test_mixed_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + int dim2 = 1; + int dim4 = 3; + + auto reduction_axis = make_index_list(0, dim2, 2, dim4); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + 
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[3]), 3); + + typedef IndexList, int, type2index<2>, int> ReductionIndices; + ReductionIndices reduction_indices; + reduction_indices.set(1, 1); + reduction_indices.set(3, 3); + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + + Tensor result1 = tensor.sum(reduction_axis); + Tensor result2 = tensor.sum(reduction_indices); + + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + expected += tensor(i,j,k,l); + } + } + } + } + VERIFY_IS_APPROX(result1(0), expected); + VERIFY_IS_APPROX(result2(0), expected); +} + + +void test_cxx11_tensor_index_list() +{ + CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_dynamic_index_list()); + CALL_SUBTEST(test_mixed_index_list()); +} -- cgit v1.2.3 From eeabf7975e59b47f4e3677c340013ebbfcfbc2bd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Nov 2014 22:35:44 -0800 Subject: Optimized broadcasting --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 36 +++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 2bd158dac..a77903dca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -24,11 +24,13 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; }; template @@ -85,6 +87,7 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; enum { IsAligned = false, @@ -129,10 +132,19 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * 
m_inputStrides[i]; + } index -= idx * m_outputStrides[i]; } - inputIndex += (index % m_impl.dimensions()[0]); + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } return m_impl.coeff(inputIndex); } @@ -150,10 +162,20 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } index -= idx * m_outputStrides[i]; } - const Index innermostLoc = index % m_impl.dimensions()[0]; + Index innermostLoc; + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } inputIndex += innermostLoc; // Todo: this could be extended to the second dimension if we're not -- cgit v1.2.3 From ec785b0180f6cf9355b89d85c53fa18acf83e8a6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 13 Nov 2014 09:28:54 -0800 Subject: Added support for extraction of patches from images --- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 13 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 291 +++++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_image_patch.cpp | 280 ++++++++++++++++++++ 6 files changed, 587 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h create mode 100644 unsupported/test/cxx11_tensor_image_patch.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 44d5a4d82..aa26e5283 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -59,6 +59,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 6018ecc66..f451a3c99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -255,6 +255,19 @@ class TensorBase return TensorPatchOp(derived(), patch_dims); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches() const { + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride); + } + // Morphing operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a72e11215..85599ccfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -27,6 +27,7 @@ template class Tenso template class TensorContractionOp; template class TensorConvolutionOp; template class TensorPatchOp; +template class TensorImagePatchOp; template class TensorBroadcastingOp; template class TensorChippingOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 000000000..ce916fdfd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has a least 3 dimensions ordered as follow: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. 
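 * For example (an illustrative sketch matching the tests added in this
 * commit): extracting 11x11 patches from an input of shape (3, 128, 128)
 * with unit strides yields a result of shape (3, 11, 11, 128*128), i.e. one
 * zero-padded patch centered at each of the 128*128 input locations.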
+ */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorImagePatchOp type; +}; + +} // end namespace internal + + + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides){} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = ceilf(static_cast(input_dims[1]) / op.row_strides()) * + ceilf(static_cast(input_dims[2]) / op.col_strides()); + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_rowInputStride = input_dims[0] * op.row_strides(); + m_colInputStride = input_dims[0] * input_dims[1] * op.col_strides(); + m_patchInputStride = input_dims[0] * input_dims[1] * 
input_dims[2]; + + m_rowPaddingTop = op.patch_rows() / 2; + m_colPaddingLeft = op.patch_cols() / 2; + + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInputRows = internal::TensorIntDivisor(m_inputRows); + m_fastDimZero = internal::TensorIntDivisor(m_dimensions[0]); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + const Index patchIndex = index / m_fastPatchStride; + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colOffset = patchOffset / m_fastColStride; + + const Index inputCol = colIndex + colOffset - m_colPaddingLeft; + if (inputCol < 0 || inputCol >= m_inputCols) { + return Scalar(0); + } + const Index rowIndex = patch2DIndex - colIndex * m_inputRows; // m_rowStride is always 1 + const Index rowOffset = patchOffset - colOffset * m_colStride; + + const Index inputRow = rowIndex + rowOffset - m_rowPaddingTop; + if (inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero, + (indices[1] - patchIndex * m_patchStride) / m_fastDimZero}; + + const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0] - m_colPaddingLeft, colIndex + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_inputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0] - m_rowPaddingTop, rowIndex + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputRows; + Index m_inputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastInputRows; + internal::TensorIntDivisor m_fastDimZero; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 181f06fc7..89c651804 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,6 +122,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") ei_add_test(cxx11_tensor_patch "-std=c++0x") + ei_add_test(cxx11_tensor_image_patch "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp new file mode 100644 index 000000000..55d35eac0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -0,0 +1,280 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + Tensor single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j, b); + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { + expected = tensor(d, r-1+i, c-1+j, b); + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + + +static void test_patch_no_extra_dim() +{ + Tensor tensor(2,3,5); + tensor.setRandom(); + + Tensor single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { 
+ expected = tensor(d, r-1+i, c-2+j); + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); + } + } + } + } + } + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { + expected = tensor(d, r-1+i, c-1+j); + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); + } + } + } + } + } +} + + +static void test_imagenet_patches() +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + Tensor l_in(3, 128, 128, 128); + l_in.setRandom(); + Tensor l_out = l_in.extract_image_patches(11, 11); + VERIFY_IS_EQUAL(l_out.dimension(0), 3); + VERIFY_IS_EQUAL(l_out.dimension(1), 11); + VERIFY_IS_EQUAL(l_out.dimension(2), 11); + VERIFY_IS_EQUAL(l_out.dimension(3), 128*128); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (int c = 0; c < 11; ++c) { + for (int r = 0; r < 11; ++r) { + for (int d = 0; d < 3; ++d) { + float expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in(d, r-5+i, c-5+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(64, 64, 64, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(9, 9); + VERIFY_IS_EQUAL(l_out.dimension(0), 64); + VERIFY_IS_EQUAL(l_out.dimension(1), 9); + VERIFY_IS_EQUAL(l_out.dimension(2), 9); + VERIFY_IS_EQUAL(l_out.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (int c = 0; c < 9; ++c) { + for (int r = 0; r < 9; ++r) { + for (int d = 0; d < 64; ++d) { + float expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in(d, r-4+i, c-4+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(128, 16, 16, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(7, 7); + VERIFY_IS_EQUAL(l_out.dimension(0), 128); + VERIFY_IS_EQUAL(l_out.dimension(1), 7); + VERIFY_IS_EQUAL(l_out.dimension(2), 7); + VERIFY_IS_EQUAL(l_out.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (int c = 0; c < 7; ++c) { + for (int r = 0; r < 7; ++r) { + for (int d = 0; d < 128; ++d) { + float expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in(d, r-3+i, c-3+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(384, 13, 13, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(3, 3); + VERIFY_IS_EQUAL(l_out.dimension(0), 384); + VERIFY_IS_EQUAL(l_out.dimension(1), 3); + VERIFY_IS_EQUAL(l_out.dimension(2), 3); + VERIFY_IS_EQUAL(l_out.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for 
(int b = 0; b < 128; ++b) {
+    for (int i = 0; i < 13; ++i) {
+      for (int j = 0; j < 13; ++j) {
+        int patchId = i+13*j;
+        for (int c = 0; c < 3; ++c) {
+          for (int r = 0; r < 3; ++r) {
+            for (int d = 0; d < 384; ++d) {
+              float expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in(d, r-1+i, c-1+j, b);
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_image_patch()
+{
+  CALL_SUBTEST(test_simple_patch());
+  CALL_SUBTEST(test_patch_no_extra_dim());
+  CALL_SUBTEST(test_imagenet_patches());
+}
-- cgit v1.2.3


From 1d3c8306f87b284c26180be6eac13dc8d4aa1b52 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 13 Nov 2014 19:13:17 -0800
Subject: Fixed compilation errors with clang.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 -
 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h  | 4 ++--
 unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h   | 1 -
 unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h    | 10 +++++-----
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index a77903dca..8cb41aec8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -30,7 +30,6 @@ struct traits > : public traits::type _Nested;
-  static const int NumDimensions = XprTraits::NumDimensions;
 };

 template
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 8e898619d..c5ec42cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -848,8 +848,8 @@ struct TensorEvaluator(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
 RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index ce916fdfd..0dfb6913b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -37,7 +37,6 @@ struct traits > : public traits
   typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference::type _Nested;
-  static const int NumDimensions = XprTraits::NumDimensions + 1;
 };

 template
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 010221e74..eaf0195ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -110,7 +110,7 @@ struct tuple_coeff<0> {
     update_value(std::get<0>(t), value);
   }
   template
-  static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) {
+  static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) {
     // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
     return is_compile_time_constant >::type>::value & (i == 0);
   }
@@ -190,7 +190,7 @@ template struct index_statically_eq > {
   constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
     return IndexList().value_known_statically(i) &
-           IndexList()[i] == value;
+           (IndexList()[i] == value);
   }
 };

@@ -198,7 +198,7 @@ template struct
index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -213,7 +213,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -221,7 +221,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; -- cgit v1.2.3 From b33cf92878a57ec86d5e5715e7cde3a0cd360fd6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 18 Nov 2014 14:32:41 -0800 Subject: Fixed the evaluation of expressions involving tensors of 2 or 3 elements on CUDA devices. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4fa8e83ef..f27f643c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -168,11 +168,10 @@ __launch_bounds__(1024) const Index PacketSize = unpacket_traits::size; const Index vectorized_step_size = step_size * PacketSize; const Index vectorized_size = (size / PacketSize) * PacketSize; - Index i = first_index * PacketSize; - for ( ; i < vectorized_size; i += vectorized_step_size) { + for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for ( ; i < size; i += step_size) { + for (Index i = vectorized_size + first_index; i < size; i += step_size) { eval.evalScalar(i); } } -- cgit v1.2.3 From 509e4ddc02e0d70b8c1ee325f3b18624d4235c1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Nov 2014 10:34:11 -0800 Subject: Added reduction packet primitives for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 7b481d512..19749c832 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -223,6 +223,27 @@ template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { return a.x; } +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); } -- cgit v1.2.3 From 9f98650d0a82d4757afb4503ce6f2b6f61763463 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 Jan 2015 09:29:13 -0800 Subject: Ensured that contractions that can be reduced to a matrix vector product work correctly 
even when the input coefficients aren't aligned. --- Eigen/src/Core/products/GeneralMatrixVector.h | 8 +++-- unsupported/test/cxx11_tensor_contraction.cpp | 48 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 7dfa48bfb..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -140,10 +140,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { @@ -412,10 +413,13 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 2b599d30d..17bd335f7 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -352,6 +352,52 @@ static void test_large_contraction() } +static void test_matrix_vector() +{ + Tensor t_left(30, 50); + Tensor t_right(50); + Tensor t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + +static void test_tensor_vector() +{ + Tensor t_left(7, 13, 17); + Tensor t_right(1, 7); + typedef typename Tensor::DimensionPair DimensionPair; + Eigen::array dim_pair01{{{0, 1}}}; + Tensor t_result = t_left.contract(t_right, dim_pair01); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); @@ -364,4 +410,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_tensor_vector()); } -- cgit v1.2.3 From 91dd53e54db5c85c37e05bce5af95d31ba337e34 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 13 Jan 2015 16:07:51 -0800 Subject: Created some documentation --- unsupported/Eigen/CXX11/src/Tensor/README.md | 1446 ++++++++++++++++++++++++++ 1 file changed, 1446 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/README.md diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 000000000..6a4d52cc3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1446 @@ +# Eigen Tensors + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace ```::Eigen.``` + + +### Class Tensor<data_type, rank> + +This is the class to use to create a tensor and allocate memory for it. 
The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank. The rank is the number of dimensions; for example, rank 2 is a
+matrix.
+
+Tensors of this class are resizable. For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the ```rank```
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicit list of parameters. The array type to use is
+```Eigen::array<Eigen::Index>```. The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time. Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler. FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough, the
+tensor data is held on the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code. It allows you to view any piece of
+allocated memory as a Tensor. Instances of this class do not own the memory
+where the data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes. The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors. Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```. You must pass as many parameters as the rank of ```tensor```.
+The expression can be used as an l-value to set the value of the element at the
+specified position. The value returned is of the datatype of the tensor.
+
+    // Set the value of the element at position (0, 1, 0).
+    Tensor<float, 3> t_3d(2, 3, 4);
+    t_3d(0, 1, 0) = 12.0f;
+
+    // Initialize all elements to random values.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          t_3d(i, j, k) = ...some random value...;
+        }
+      }
+    }
+
+    // Print elements of a tensor.
+    for (int i = 0; i < 2; ++i) {
+      LOG(INFO) << t_3d(i, 0, 0);
+    }
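The element-by-element initialization above can also be done in one call (a
minimal sketch, assuming the ```setRandom()``` method exercised by the test
suite earlier in this series):

    // Fill t_3d with random values in a single call.
    t_3d.setRandom();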
+ for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: ```ColMajor``` (the default) and +```RowMajor```. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor col_major; // equivalent to Tensor + TensorMap > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. + + Tensor col_major(2, 4); + Tensor row_major(2, 4); + + Tensor col_major_result = col_major; // ok, layouts match + Tensor col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor t1(2, 3, 4); + ...set some values in t1... + Tensor t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression ```t1 + t2 * +0.3f``` is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ ```auto``` keyword +does not have its intuitive meaning. 
Consider these 2 lines of code: + + Tensor t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor ```t3``` and it will contain the +result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +is actually the tree of tensor operators that will compute the addition of +```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get +the values of its elements: + + Tensor t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use ```auto``` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use ```auto``` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of set of tensor computations you have to assign the +result to a Tensor that will be capable of holding onto them. This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory. All the following will work: + + auto t4 = t1 + t2; + + Tensor result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap result(, , ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed. + + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expression are Evaluated + +There are several ways to control when expressions are evaluated: +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the ```auto``` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor ```result``` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize result = t5; + +Simiarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that are assigned to them. + +#### Calling eval(). + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the ```eval()``` method of the +expression Operation. 
+ + // The previous example could have been written: + Tensor result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling ```eval()``` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize tmp = t1 + t2; + Tensor result = (tmp * 0.2f).exp(); + +Note that the return value of ```eval()``` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. + auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3.x + Tensor result = t4; + +While in the examples above calling ```eval()``` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the ```broadcast()``` expression causes the ```X.maximum()``` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to ```eval()``` between the ```maximum()``` and +```reshape()``` calls guarantees that maximum() is only computed once and +greatly speeds-up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor ```Y``` is both used in the expression +and its assignment. This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()``` +expressions ensures that the sum is computed before any updates to ```Y``` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of ```Y``` +then you would need to force an eval for correctness by adding an ```eval()``` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a TensorRef. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the ```()``` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operation themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. 
+TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first. + +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the ```device()``` call. If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor a(30, 40); + Tensor b(30, 40); + Tensor c = a + b; + +To choose a different implementation you have to insert a ```device()``` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor c(30, 40); + c.device(...) = a + b; + +The call to ```device()``` must be the last call on the left of the operator=. + +You must pass to the ```device()``` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a ```device()``` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + // Create the Eigen ThreadPoolDevice. + Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### <Tensor-Type>::Dimensions + +Acts like an array of ints. Has an ```int size``` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See ```dimensions()```. + +#### <Tensor-Type>::Index + +Acts like an ```int```. Used for indexing tensors along their dimensions. See +```operator()```, ```dimension()```, and ```size()```. + +#### <Tensor-Type>::Scalar + +Represents the datatype of individual tensor elements. For example, for a +```Tensor```, ```Scalar``` is the type ```float```. See +```setConstant()```. + +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. 
+ +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is ::Dimensions. + + Eigen::Tensor a(3, 4); + const Eigen::Tensor::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```::Index```, but you can +always use it like an int. + + Eigen::Tensor a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```::Index```, but you can always use it like an int. + + Eigen::Tensor a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors and Copies + +TODO. + + Tensor(...) + TensorFixedSize(...) + TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) + TensorMap(PointerArgType dataPtr, const array& dimensions) + TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + Self& operator=(const Self& other) + Self& operator=(const OtherDerived& other) + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize are created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory the memory its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory. These +have an immediate effect on the tensor and return the tensor itself as a +result. 
These are not tensor Operations which delay evaluation. + +### <Tensor-Type> setConstant(const Scalar& val) + +Sets all elements of the tensor to the constant value ```val```. ```Scalar``` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that ```setConstant()``` can be used on any tensor where the element type +has a copy constructor and an ```operator=()```: + + Eigen::Tensor a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### <Tensor-Type> setZero() + +Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### <Tensor-Type> setValues({..initializer_list}) + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contains P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a ```TensorFixedSize``` the initializer list must +contains 2 lists of 3 floats each. + +```setValues()``` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed. This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### <Tensor-Type> setRandom() + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. + + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize ```setRandom()``` by providing your own random number +generator as a template argument: + + a.setRandom(); + +Here, ```MyRandomGenerator``` must be a struct with the following member +functions, where Scalar and Index are the same as ```::Scalar``` +and ```::Index```. + +See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + + // Same as above but generates several numbers at a time. 
+ typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +TODO + + const Scalar& operator()(const array& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator()(const array& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator[](Index index) + ??? mention coeff() and coeffRef() ??? + +### Scalar* data() +### const Scalar* data() const + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor ```Operations```. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expression are Evaluated" for more details about +their evaluation. + +### <Operation> constant(const Scalar& val) + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value ```val```. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.constant(2.0f); + Eigen::Tensor c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### <Operation> random() + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for ```setRandom()```. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### <Operation> operator-() + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. 
+ + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### <Operation> sqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. + +### <Operation> rsqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### <Operation> square() + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. + +### <Operation> inverse() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### <Operation> exp() + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### <Operation> log() + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### <Operation> abs() + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### <Operation> pow(Scalar exponent) + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjuntion with tensors of integer values. + +You can use cast() to lift this restriction. For example this computes +cubic roots of an int Tensor: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor b = a.cast().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### <Operation> operator * (Scalar scale) +TODO + +### <Operation> cwiseMax(Scalar threshold) +TODO + +### <Operation> cwiseMin(Scalar threshold) +TODO + + ### <Operation> unaryExpr(const CustomUnaryOp& func) +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments. The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently. + +### <Operation> operator+(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### <Operation> operator-(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### <Operation> operator*(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### <Operation> operator/(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. 
+ +### <Operation> cwiseMax(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### <Operation> cwiseMin(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise mimimums of the inputs. + +### <Operation> Logical operators + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) + +* operator||(const OtherDerived& other) + +* operator<(const OtherDerived& other) + +* operator<=(const OtherDerived& other) + +* operator>(const OtherDerived& other) + +* operator>=(const OtherDerived& other) + +* operator==(const OtherDerived& other) + +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values. + + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor if = ...; + Tensor then = ...; + Tensor else = ...; + Tensor result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result. The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. + + +## Contractions + +TODO + contract(const OtherDerived& other, const Dimensions& dims) + + + +## Reduction Operations + +A *Reduction* operation returns a tensor with fewer dimensions than the +original tensor. The values in the returned tensor are computed by applying a +*reduction operator* to slices of values from the original tensor. You specify +the dimensions along which the slices are made. + +The Eigen Tensor library provides a set of predefined reduction operators such +as ```maximum()``` and ```sum()``` and lets you define additional operators by +implementing a few methods from a reductor template. + +### Reduction Dimensions + +All reduction operations take a single parameter of type +```::Dimensions``` which can always be specified as an array of +ints. These are called the "reduction dimensions." The values are the indices +of the dimensions of the input tensor over which the reduction is done. The +parameter can have at most as many element as the rank of the input tensor; +each element must be less than the tensor rank, as it indicates one of the +dimensions to reduce. + +Each dimension of the input tensor should occur at most once in the reduction +dimensions as the implementation does not remove duplicates. + +The order of the values in the reduction dimensions does not affect the +results, but the code may execute faster if you list the dimensions in +increasing order. + +Example: Reduction along one dimension. + + // Create a tensor of 3 dimensions: 2, 3, 4 + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + // Reduce it along the second dimension (1)... + Eigen::array dims({1 /* dimension to reduce */}); + // ...using the "maximum" operator. + // The result is a tensor with one dimension. The size of + // that dimension is the same as the first (non-reduced) dimension of a. 
+ Eigen::Tensor b = a.maximum(dims); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 6 5 4 + + b + 3 + 6 + +Example: Reduction along two dimensions. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // The tensor a has 3 dimensions. We reduce along the + // first 2, resulting in a tensor with a single dimension + // of size 4 (the last dimension of a.) + // Note that we pass the array of reduction dimensions + // directly to the maximum() call. + Eigen::Tensor b = + a.maximum(Eigen::array({0, 1})); + cout << "b" << endl << b << endl << endl; + => + b + 20 + 21 + 22 + 23 + +#### Reduction along all dimensions + +As a special case, if you pass no parameter to a reduction operation the +original tensor is reduced along *all* its dimensions. The result is a +one-dimension tensor with a single value. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### <Operation> sum(const Dimensions& new_dims) +### <Operation> sum() + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### <Operation> mean(const Dimensions& new_dims) +### <Operation> mean() + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### <Operation> maximum(const Dimensions& new_dims) +### <Operation> maximum() + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. + +### <Operation> minimum(const Dimensions& new_dims) +### <Operation> minimum() + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### <Operation> prod(const Dimensions& new_dims) +### <Operation> prod() + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) + +Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Convolutions + +TBD: convolve(const KernelDerived& kernel, const Dimensions& dims) + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### <Operation> reshape(const Dimensions& new_dims) + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. + + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. 
+ Tensor input(7, 11); + array three_dims{{7, 11, 1}}; + Tensor result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array one_dim{{7 * 11}}; + Tensor result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you ```reshape()``` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array one_dim({3 * 2}); + Eigen::Tensor b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is a lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follow: + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array two_dim({2, 3}); + Eigen::Tensor b; + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b. + + +### <Operation> shuffle(const Shuffle& shuffle) + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th +dimension of the output tensor equals to the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output = input.shuffle({1, 2, 0}) + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in a lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### <Operation> stride(const Strides& strides) + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]). 
+ +For example this is what happens when you ```stride()``` a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array strides({3, 2}); + Eigen::Tensor b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(40, 90, 200); + output.stride({2, 3, 4}) = input; + + +### <Operation> slice(const StartIndices& startIndices, + const Sizes& sizes) + +TBD + + +### <Operation> chip(const Index offset, const Index dim) + +A chip is a special kind of slice. It is the subtensor at the given offset in +the dimension dim. The returned tensor has one fewer dimension than the input +tensor: the dimension dim is removed. + +For example, a matrix chip would be either a row or a column of the input +matrix. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::Tensor row_3 = a.chip(2, 0); + Eigen::Tensor col_2 = a.chip(1, 1); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "row_3" << endl << row_3 << endl; + => + row_3 + 600 700 800 + cout << "col_2" << endl << col_2 << endl; + => + col_2 + 100 400 700 1000 + +It is possible to assign values to a tensor chip since the chip operation is a +lvalue. For example: + + Eigen::Tensor a(3); + a.setValues({{100, 200, 300}}); + Eigen::Tensor b(2, 3); + b.setZero(); + b.chip(0, 0) = a; + cout << "a" << endl << a << endl; + => + a + 100 + 200 + 300 + cout << "b" << endl << b << endl; + => + b + 100 200 300 + 0 0 0 + + +### <Operation> reverse(const ReverseDimensions& reverse) + +Returns a view of the input tensor that reverses the order of the coefficients +along a subset of the dimensions. The argument reverse is an array of boolean +values that indicates whether or not the order of the coefficients should be +reversed along each of the dimensions. This operation preserves the dimensions +of the input tensor. + +For example this is what happens when you ```reverse()``` the first dimension +of a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array reverse({true, false}); + Eigen::Tensor b = a.reverse(reverse); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + b + 900 1000 1100 + 600 700 800 + 300 400 500 + 0 100 200 + + +TODO +### <Operation> broadcast(const Broadcast& broadcast) + +TODO + +### <Operation> concatenate(const OtherDerived& other, Axis axis) + +TODO + +### <Operation> pad(const PaddingDimensions& padding) + +TODO + +### <Operation> extract_patches(const PatchDims& patch_dims) + +TODO + +### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) + +TODO + + +## Special Operations + +### <Operation> cast<T>() + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor a(2, 3); + Eigen::Tensor b = a.cast(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. 
This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor b = + (a.cast() / a.constant(2).cast()).cast(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### <Operation> eval() + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 1. It would be +more logical and user friendly to use tensors of rank 0 instead. For example +Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 1d tensor. In the +future these operations might be updated to return 0d tensors instead. + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* TensorVarDims are only partially supported +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + -- cgit v1.2.3 From c94174b4fe76636ae5f027ad8e59023cd154d90d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:13:08 -0800 Subject: Improved tensor references --- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 73 +++++++++++++++++++++++++- unsupported/test/cxx11_tensor_ref.cpp | 16 ++++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index d43fb286e..0a87e67eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -64,7 +64,7 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator(dummy); }; @@ -137,6 +137,8 @@ template class TensorRef : public TensorBase class TensorRef : public TensorBasedimensions().size(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } EIGEN_DEVICE_FUNC @@ -197,6 +201,13 @@ template class TensorRef : public TensorBase indices{{firstIndex, otherIndices...}}; return coeff(indices); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... 
otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } #else EIGEN_DEVICE_FUNC @@ -237,6 +248,44 @@ template class TensorRef : public TensorBase indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } #endif template EIGEN_DEVICE_FUNC @@ -244,7 +293,7 @@ template class TensorRef : public TensorBasedimensions(); Index index = 0; - if (PlainObjectType::Options&RowMajor) { + if (PlainObjectType::Options & RowMajor) { index += indices[0]; for (int i = 1; i < NumIndices; ++i) { index = index * dims[i] + indices[i]; @@ -257,6 +306,24 @@ template class TensorRef : public TensorBasecoeff(index); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const @@ -298,6 +365,8 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = false, + Layout = TensorRef::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index 4ff94a059..aa369f278 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -181,6 +181,21 @@ static void test_ref_in_expr() } +static void test_coeff_ref() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + Tensor original = tensor; + + TensorRef> slice = tensor.chip(7, 4); + slice.coeffRef(0, 0, 0, 0) = 1.0f; + slice.coeffRef(1, 0, 0, 0) += 2.0f; + + VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f); + VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f); +} + + void test_cxx11_tensor_ref() { CALL_SUBTEST(test_simple_lvalue_ref()); @@ -189,4 +204,5 @@ void test_cxx11_tensor_ref() CALL_SUBTEST(test_slice()); CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); + CALL_SUBTEST(test_coeff_ref()); } -- cgit v1.2.3 From b00fe1590dd72d51ac3e44c42102caac10a54c28 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:14:46 -0800 Subject: Added ability to swap the layout of a tensor --- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 198 +++++++++++++++++++++ unsupported/test/cxx11_tensor_layout_swap.cpp | 61 +++++++ 2 files changed, 259 insertions(+) create mode 100644 
unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h create mode 100644 unsupported/test/cxx11_tensor_layout_swap.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 000000000..7e448f7c0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + + + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (TensorEvaluator::Layout == ColMajor) ? 
RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp new file mode 100644 index 000000000..ae297a9da --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_swap() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +static void test_swap_as_lvalue() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2(7,3,2); + tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +void test_cxx11_tensor_layout_swap() +{ + CALL_SUBTEST(test_simple_swap()); + CALL_SUBTEST(test_swap_as_lvalue()); +} -- cgit v1.2.3 From 4928ea121250fba0979933463624b1edf9863672 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:15:58 -0800 Subject: Added ability to reverse the order of the coefficients in a tensor --- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 207 +++++++++++++++++++++ unsupported/test/cxx11_tensor_reverse.cpp | 167 +++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h create mode 100644 unsupported/test/cxx11_tensor_reverse.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 000000000..439cf3230 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,207 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorReverseOp type; +}; + +} // end namespace internal + + + + +template +class TensorReverseOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind + StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, + const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (Layout == ColMajor) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index 
index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp new file mode 100644 index 000000000..4c0be35da --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
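The index arithmetic in coeff() above is easier to see outside the diff: decompose the output's linear index into per-dimension coordinates using the precomputed strides, flip the coordinate of every reversed dimension, and re-linearize. A minimal standalone sketch (rank 3, column-major, illustrative names only; this is not code from the patch):

#include <array>
#include <cstddef>

// Maps a linear index in the reversed (output) tensor to the linear index of
// the corresponding coefficient in the input tensor.
std::ptrdiff_t reversed_input_index(std::ptrdiff_t index,
                                    const std::array<std::ptrdiff_t, 3>& dims,
                                    const std::array<std::ptrdiff_t, 3>& strides,
                                    const std::array<bool, 3>& reverse) {
  std::ptrdiff_t input = 0;
  for (int i = 2; i > 0; --i) {
    std::ptrdiff_t idx = index / strides[i];   // coordinate along dimension i
    index -= idx * strides[i];
    if (reverse[i]) idx = dims[i] - idx - 1;   // flip within this dimension
    input += idx * strides[i];
  }
  input += reverse[0] ? dims[0] - index - 1 : index;  // innermost coordinate
  return input;
}

For dims = {2, 3, 5} with strides {1, 2, 6} and reverse = {false, true, false}, output coordinate (i, j, k) reads input (i, 2 - j, k); this is the same relationship the test below verifies in four dimensions.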
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::array; + +template +static void test_simple_reverse() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + Tensor reversed_tensor; + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } +} + + +template +static void test_expr_reverse() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + + Tensor expected; + expected = tensor.reverse(dim_rev); + + Tensor result(2,3,5,7); + + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{2,3,1,7}}; + array dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 3); + VERIFY_IS_EQUAL(result.dimension(2), 5); + VERIFY_IS_EQUAL(result.dimension(3), 7); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[2] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_reverse() +{ + 
CALL_SUBTEST(test_simple_reverse<ColMajor>());
+  CALL_SUBTEST(test_simple_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>());
+  CALL_SUBTEST(test_expr_reverse<RowMajor>());
+}
--
cgit v1.2.3

From 3bd2b41b2e074f9feb31bad7c3bf9769368b5d1a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 14 Jan 2015 10:17:02 -0800
Subject: Created a test for tensor type casting

---
 unsupported/test/cxx11_tensor_casts.cpp | 41 +++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_casts.cpp

diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
new file mode 100644
index 000000000..4f7ff7067
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+static void test_simple_cast()
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor.setRandom();
+  Tensor<char, 2> chartensor(20,30);
+  chartensor.setRandom();
+  Tensor<std::complex<float>, 2> cplextensor(20,30);
+  cplextensor.setRandom();
+
+  chartensor = ftensor.cast<char>();
+  cplextensor = ftensor.cast<std::complex<float> >();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j)));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_casts()
+{
+   CALL_SUBTEST(test_simple_cast());
+}
--
cgit v1.2.3

From 8f4b8d204bd5f9bf3693b162b799397fa899220e Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 14 Jan 2015 10:19:33 -0800
Subject: Improved the performance of tensor reductions

Added the ability to generate random numbers following a normal distribution.
Created a test to validate the ability to generate random numbers.
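The core of this change is a new reducer contract: instead of functors that carry their own accumulator and return it from finalize(), reducers now expose initialize()/reduce()/finalize() operating on a caller-owned accumulator, with optional packet variants. A minimal sketch of how an evaluator drives such a reducer (the member signatures mirror the SumReducer in the diff below; reduce_range is a hypothetical driver, not patch code):

// Sketch only: a caller-owned accumulator driven through the new interface.
template <typename T> struct SumReducerSketch {
  T initialize() const { return static_cast<T>(0); }          // neutral element
  void reduce(const T t, T* accum) const { (*accum) += t; }   // fold in one value
  T finalize(const T accum) const { return accum; }           // post-process result
};

template <typename T, typename Reducer>
T reduce_range(const T* data, int n, const Reducer& reducer) {
  T accum = reducer.initialize();
  for (int i = 0; i < n; ++i) {
    reducer.reduce(data[i], &accum);
  }
  return reducer.finalize(accum);
}

Because the accumulator now lives with the caller rather than inside the functor (MeanReducer's element count being the one exception), a single reducer instance can serve many output coefficients at once, which is what enables the vectorized reducePacket/finalizePacket path and the inner-dimension specializations added below.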
--- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 245 ++++++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 216 ++++++++++++++---- unsupported/test/cxx11_tensor_random.cpp | 78 +++++++ 3 files changed, 473 insertions(+), 66 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_random.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index e9aa22183..7b8d34321 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -16,50 +16,157 @@ namespace internal { // Standard reduction functors template struct SumReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SumReducer() : m_sum(0) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - m_sum += t; + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_sum; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); } - private: - typename internal::remove_all::type m_sum; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template struct MeanReducer +{ + static const bool PacketAccess = true; + MeanReducer() : count_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + count_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + count_ += packet_traits::size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / count_; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / count_; + } + + protected: + int count_; }; template struct MaxReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max(-(std::numeric_limits::max)()) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - if (t > m_max) { m_max = t; } + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_max; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax(*accum, p); } - private: - typename internal::remove_all::type m_max; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return -(std::numeric_limits::max)(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(-(std::numeric_limits::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const 
{ + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (std::max)(saccum, predux_max(vaccum)); + } }; template struct MinReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MinReducer() : m_min((std::numeric_limits::max)()) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - if (t < m_min) { m_min = t; } + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_min; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin(*accum, p); } - private: - typename internal::remove_all::type m_min; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return (std::numeric_limits::max)(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1((std::numeric_limits::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (std::min)(saccum, predux_min(vaccum)); + } }; +template struct ProdReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + #if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) // We're not compiling a cuda kernel template struct UniformRandomGenerator { + + static const bool PacketAccess = true; + template T operator()(Index, Index = 0) const { return random(); @@ -81,16 +188,19 @@ template struct UniformRandomGenerator { template struct UniformRandomGenerator; template <> struct UniformRandomGenerator { - UniformRandomGenerator() { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { const int tid = blockIdx.x * blockDim.x + threadIdx.x; curand_init(0, tid, 0, &m_state); } - template + template EIGEN_DEVICE_FUNC float operator()(Index, Index = 0) const { return curand_uniform(&m_state); } - template + template EIGEN_DEVICE_FUNC float4 packetOp(Index, Index = 0) const { return curand_uniform4(&m_state); } @@ -100,15 +210,18 @@ template <> struct UniformRandomGenerator { }; template <> struct UniformRandomGenerator { - UniformRandomGenerator() { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { const int tid = blockIdx.x * blockDim.x + threadIdx.x; curand_init(0, tid, 0, &m_state); } - template + template EIGEN_DEVICE_FUNC double operator()(Index, Index = 0) const { return curand_uniform_double(&m_state); } - template + template EIGEN_DEVICE_FUNC double2 packetOp(Index, Index = 0) const { return 
curand_uniform2_double(&m_state); } @@ -120,6 +233,84 @@ template <> struct UniformRandomGenerator { #endif +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + NormalRandomGenerator() : m_distribution(0, 1) {} + NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { } + + template + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template + typename internal::packet_traits::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload::type>(values); + } + + mutable std::normal_distribution m_distribution; + mutable std::default_random_engine m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template struct NormalRandomGenerator; + +template <> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index cbe87394b..eebcc4850 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -43,6 +43,75 @@ struct nested, 1, typename eval type; }; + +template +struct are_inner_most_dims { + static const bool value = false; +}; +#if __cplusplus > 199711L +template +struct are_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_eq()(0, 0) && + index_statically_eq()(array_size::value-1, array_size::value-1); +}; +template +struct are_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_eq()(0, NumTensorDims - array_size::value) && + index_statically_eq()(array_size::value - 1, NumTensorDims - 1); +}; +#endif + + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const 
typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizePacket(accum, p); + } +}; + } // end namespace internal @@ -52,8 +121,8 @@ class TensorReductionOp : public TensorBase typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -85,20 +154,27 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumInputDims = internal::array_size::Dimensions>::value; static const int NumReducedDims = internal::array_size::value; - static const int NumDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims; - typedef DSizes Dimensions; + static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 
1 : NumInputDims - NumReducedDims; + typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator, Device> Self; + static const bool InputPacketAccess = TensorEvaluator::PacketAccess; enum { IsAligned = false, - PacketAccess = false, // The code isn't vectorized properly yet + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()) { EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Bitmap indicating if an input dimension is reduced or not. array reduced; for (int i = 0; i < NumInputDims; ++i) { reduced[i] = false; @@ -122,24 +198,41 @@ struct TensorEvaluator, Device> } } - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + // Precompute output strides. + if (Layout == ColMajor) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } } - array strides; - strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - strides[i] = strides[i-1] * input_dims[i-1]; + // Precompute input strides. + array input_strides; + if (Layout == ColMajor) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } } + outputIndex = 0; reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { if (reduced[i]) { - m_reducedStrides[reduceIndex] = strides[i]; + m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { - m_preservedStrides[outputIndex] = strides[i]; + m_preservedStrides[outputIndex] = input_strides[i]; ++outputIndex; } } @@ -147,6 +240,7 @@ struct TensorEvaluator, Device> // Special case for full reductions if (NumInputDims == NumReducedDims) { m_dimensions[0] = 1; + m_preservedStrides[0] = internal::array_prod(input_dims); } } @@ -161,14 +255,22 @@ struct TensorEvaluator, Device> m_impl.cleanup(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Op reducer(m_reducer); - reduce(firstInput(index), 0, reducer); - return reducer.finalize(); + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (Layout == ColMajor) ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } } // TODO(bsteiner): provide a more efficient implementation. @@ -179,9 +281,20 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -190,34 +303,59 @@ struct TensorEvaluator, Device> Scalar* data() const { return NULL; } private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - startInput += index * m_preservedStrides[0]; - return startInput; - } + template friend struct internal::GenericDimReducer; + template friend struct internal::InnerMostDimReducer; - EIGEN_DEVICE_FUNC void reduce(Index firstIndex, int DimIndex, Op& reducer) const { - for (int j = 0; j < m_reducedDims[DimIndex]; ++j) { - const Index input = firstIndex + j * m_reducedStrides[DimIndex]; - if (DimIndex < NumReducedDims-1) { - reduce(input, DimIndex+1, reducer); + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (Layout == ColMajor) { + return index * m_preservedStrides[0]; } else { - reducer.reduce(m_impl.coeff(input)); + return index * m_preservedStrides[NumOutputDims - 1]; } } + Index startInput = 0; + if (Layout == ColMajor) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; } + // Dimensions of the output of the operation. Dimensions m_dimensions; - array m_outputStrides; - array m_preservedStrides; + // Precomputed strides for the output tensor. 
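+  // Indexed by output dimensions.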
+ array m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + array m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. array m_reducedDims; + + // Evaluator for the input expression. TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. Op m_reducer; }; diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp new file mode 100644 index 000000000..8276ae822 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +static void test_default() +{ + Tensor vec(6); + vec.setRandom(); + + // Fixme: we should check that the generated numbers follow a uniform + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + +static void test_normal() +{ + Tensor vec(6); + vec.setRandom>(); + + // Fixme: we should check that the generated numbers follow a gaussian + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + + +struct MyGenerator { + MyGenerator() { } + MyGenerator(const MyGenerator&) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const { + return 3 * element_location; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + const int packetSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT int values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = 3 * (packet_location + i); + } + return internal::pload::type>(values); + } +}; + + +static void test_custom() +{ + Tensor vec(6); + vec.setRandom(); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(vec(i), 3*i); + } +} + +void test_cxx11_tensor_random() +{ + CALL_SUBTEST(test_default()); + CALL_SUBTEST(test_normal()); + CALL_SUBTEST(test_custom()); +} -- cgit v1.2.3 From 5692723c588c219bca9523962a4620fe7cc4c4c9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 11:42:52 -0800 Subject: Improved the performance of the contraction code on CUDA --- .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 1976 +++++++++++--------- 1 file changed, 1077 insertions(+), 899 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index babe33fff..f6bd949bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1,7 +1,9 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -19,7 +21,7 @@ template +template __global__ void __launch_bounds__(512) - EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ volatile Scalar lhs_shmem[72 * 64]; - __shared__ volatile Scalar rhs_shmem[72 * 64]; +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); } +} - - template +template __device__ EIGEN_STRONG_INLINE void - EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float4* lhs_shmem4, float2* rhs_shmem2, - const Index m_size, const Index n_size, const Index k_size) { - typedef float Scalar; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - const Index lane = threadIdx.x + 8 * (threadIdx.y % 4); - - // prefetch registers - float4 lhs_pf0; - float4 lhs_pf1; - - float4 rhs_pf0; - float4 rhs_pf1; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // all of these indices assume float4 loading - // this thread loads the float4 starting at this index, and then also loads - // another float4 starting 32 columns to to the right - const Index horiz_block_idx = threadIdx.z / 2; - const Index vert_block_idx = threadIdx.x / 2 + 4 * (threadIdx.y % 2); - const Index horiz_idx_in_block = threadIdx.y / 2 + 4 * (threadIdx.z % 2); - const Index vert_idx_in_block = threadIdx.x % 2; - - // there's padding in both the LHS and RHS shared memory layouts. This padding - // allows for 0 bank conflicts on all shmem stores and loads. 
- // LHS padding: 1 float4 on each 8x8 block of floats - // RHS padding: 1 float2 on each block, and 12 additional float2s between vertical blocks - // 3 and 4 - - // storage indices - // lhs index with respect to float4s - const Index lhs_store_idx_base = - 136 * horiz_block_idx + - 17 * vert_block_idx + - 8 * vert_idx_in_block + - horiz_idx_in_block; - - // rhs index with respect to floats - const Index rhs_store_idx_base = - 552 * horiz_block_idx + - 66 * vert_block_idx + - 32 * (horiz_idx_in_block / 4) + (horiz_idx_in_block % 4) + - 16 * vert_idx_in_block + - ((vert_block_idx < 4) ? 0 : 24); - - const Index lhs_store_idx_0 = lhs_store_idx_base + 544 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 544 * 1; - - const Index rhs_store_idx_0 = (rhs_store_idx_base / 2) + ((lane < 16) ? 0 : 4); - const Index rhs_store_idx_1 = rhs_store_idx_0 + 2; - const Index rhs_store_idx_2 = rhs_store_idx_0 + 1104; - const Index rhs_store_idx_3 = rhs_store_idx_1 + 1104; - - // The below diagrams show which shmem index (with respect to floats) each element - // in an 8x8 input block gets packed into: - // LHS: - // 0 4 8 12 16 20 24 28 - // 1 5 9 13 17 21 25 29 - // 2 6 10 14 18 22 26 30 - // 3 7 11 15 19 23 27 31 - // 32 36 40 44 48 52 56 60 - // ... (pack as 2 rows of float4 indexed row major, each float4 is vertical) - // - // RHS: - // 0 1 2 3 32 33 34 35 - // 4 5 6 7 36 37 38 39 - // ... (pack as 2 cols of float4 indexed col major, each float4 is horizontal) - - // Each thread in a warp loads 2 float4s. This happens in 2 instructions. On each of these - // instruction, the warp loads 2 columns (2 cols * 64 elements / col = 128 elements = 32 threads - // * 4 elements/thread). For the LHS, we're able to store the loaded float4 directly into - // shmem (using a 128 bit store instruction). For the RHS, we need to transpose the data. - // This is done with warp shuffles. Furthermore, we only use 64 bit stores for the RHS, because - // 64 bits is only 2 columns (which is all we load in a warp), and the padding for the RHS - // doesn't meet 64 bit alignment requirements (namely, the 4 consecutive floats that we want - // to load on the RHS are 8 byte aligned, not 16 byte aligned, which is required for float4). 
- - const Index load_idx_vert = 4 * (threadIdx.x + 8 * (threadIdx.y % 2)); - const Index load_idx_horiz = (threadIdx.y / 2) + 4 * threadIdx.z; +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; - const Index lhs_vert = base_m + load_idx_vert; - const Index rhs_horiz_0 = base_n + load_idx_horiz; - const Index rhs_horiz_1 = base_n + load_idx_horiz + 32; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = internal::pset1(0); \ - lhs_pf1 = internal::pset1(0); \ - \ - rhs_pf0 = internal::pset1(0); \ - rhs_pf1 = internal::pset1(0); \ - \ - const Index lhs_horiz_0 = base_k + load_idx_horiz; \ - const Index lhs_horiz_1 = base_k + load_idx_horiz + 32; \ - if (!needs_edge_check || lhs_vert + 3 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs.loadPacket(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ - } \ - } else if (lhs_vert + 2 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ - \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ - lhs_pf1.z = lhs(lhs_vert + 2, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ - } \ - } else if (lhs_vert + 1 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - } \ - } else if (lhs_vert < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - } \ -} \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (rhs_vert + 3 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ - } \ - } else if (rhs_vert + 2 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ - \ - rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ - } \ - } else if (rhs_vert + 1 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - \ - rhs_pf1.x = 
rhs(rhs_vert + 0, rhs_horiz_1); \ - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - } \ - } else if (rhs_vert < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - } \ -} \ - \ - float swap_val0 = (lane < 16) ? rhs_pf0.z : rhs_pf0.x; \ - float swap_val1 = (lane < 16) ? rhs_pf0.w : rhs_pf0.y; \ - float swap_val2 = (lane < 16) ? rhs_pf1.z : rhs_pf1.x; \ - float swap_val3 = (lane < 16) ? rhs_pf1.w : rhs_pf1.y; \ - \ - swap_val0 = __shfl_xor(swap_val0, 16); \ - swap_val1 = __shfl_xor(swap_val1, 16); \ - swap_val2 = __shfl_xor(swap_val2, 16); \ - swap_val3 = __shfl_xor(swap_val3, 16); \ - \ - if (lane < 16) { \ - rhs_pf0.z = swap_val0; \ - rhs_pf0.w = swap_val1; \ - rhs_pf1.z = swap_val2; \ - rhs_pf1.w = swap_val3; \ - } else { \ - rhs_pf0.x = swap_val0; \ - rhs_pf0.y = swap_val1; \ - rhs_pf1.x = swap_val2; \ - rhs_pf1.y = swap_val3; \ - } \ -} \ - - -#define writeRegToShmem(_) \ - lhs_shmem4[lhs_store_idx_0] = lhs_pf0; \ - \ - rhs_shmem2[rhs_store_idx_0] = make_float2(rhs_pf0.x, rhs_pf0.z); \ - rhs_shmem2[rhs_store_idx_1] = make_float2(rhs_pf0.y, rhs_pf0.w); \ - \ - lhs_shmem4[lhs_store_idx_1] = lhs_pf1; \ - \ - rhs_shmem2[rhs_store_idx_2] = make_float2(rhs_pf1.x, rhs_pf1.z); \ - rhs_shmem2[rhs_store_idx_3] = make_float2(rhs_pf1.y, rhs_pf1.w); \ + // prefetch registers + float4 lhs_pf0, rhs_pf0; - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = Scalar(0); \ - Scalar res(i, 1) = Scalar(0); \ - Scalar res(i, 2) = Scalar(0); \ - Scalar res(i, 3) = Scalar(0); \ - Scalar res(i, 4) = Scalar(0); \ - Scalar res(i, 5) = Scalar(0); \ - Scalar res(i, 6) = Scalar(0); \ - Scalar res(i, 7) = Scalar(0); \ + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. 
Despite common sense, - // the code is a bit faster with this here then at bottom of loop +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
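+    // The two stores below split each float4 into float2 halves: (x, y) lands
+    // at row threadIdx.y and (z, w) sixteen rows down, matching the
+    // feature-pair layout described above.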
+ + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + __syncthreads(); - prefetchIntoRegisters(base_k); - writeRegToShmem(); + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; -#undef prefetchIntoRegisters -#undef writeRegoToShmem + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - // wait for shared mem packing to be done before starting computation + add_vals(fl1, fl2, fr1, fr2) + } __syncthreads(); + } - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). +#undef prefetch_lhs +#undef add_vals - float4 _lcol0; - float4 _lcol1; - float2 _rrow0; - float2 _rrow1; - float2 _rrow2; - float2 _rrow3; - -#define lcol0 _lcol0.x -#define lcol1 _lcol0.y -#define lcol2 _lcol0.z -#define lcol3 _lcol0.w -#define lcol4 _lcol1.x -#define lcol5 _lcol1.y -#define lcol6 _lcol1.z -#define lcol7 _lcol1.w -#define rrow0 _rrow0.x -#define rrow1 _rrow0.y -#define rrow2 _rrow1.x -#define rrow3 _rrow1.y -#define rrow4 _rrow2.x -#define rrow5 _rrow2.y -#define rrow6 _rrow3.x -#define rrow7 _rrow3.y + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + 
i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} - // Now x corresponds to k, y to m, and z to n - const float4* lhs_block = &lhs_shmem4[threadIdx.x + 8 * (threadIdx.y % 2) + 17 * (threadIdx.y / 2)]; - const float2* rhs_block = &rhs_shmem2[2 * threadIdx.x + 16 * (threadIdx.z % 2) + 276 * (threadIdx.z / 2)]; - -#define lhs_element(i, k) lhs_block[68 * i + 136 * k] -#define rhs_element(k, j) rhs_block[33 * k + 1104 * j + ((k < 4) ? 0 : 12)] - -#define loadData(i) \ - _lcol0 = lhs_element(0, i); \ - _rrow0 = rhs_element(i, 0); \ - _rrow1 = *(&(rhs_element(i, 0)) + 1); \ - _lcol1 = lhs_element(1, i); \ - _rrow2 = rhs_element(i, 1); \ - _rrow3 = *(&(rhs_element(i, 1)) + 1); \ - -#define computeCol(j) \ - res(0, j) += lcol0 * rrow##j; \ - res(1, j) += lcol1 * rrow##j; \ - res(2, j) += lcol2 * rrow##j; \ - res(3, j) += lcol3 * rrow##j; \ - res(4, j) += lcol4 * rrow##j; \ - res(5, j) += lcol5 * rrow##j; \ - res(6, j) += lcol6 * rrow##j; \ - res(7, j) += lcol7 * rrow##j; \ -#define computePass(i) \ - loadData(i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol0 -#undef lcol1 -#undef lcol2 -#undef lcol3 -#undef lcol4 -#undef lcol5 -#undef lcol6 -#undef lcol7 -#undef rrow0 -#undef rrow1 -#undef rrow2 -#undef rrow3 -#undef rrow4 -#undef rrow5 -#undef rrow6 -#undef rrow7 +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; -#undef computePass -#undef computeCol -#undef loadData -#undef lhs_element -#undef rhs_element + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; - } // end loop over k + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. 
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, 
(threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) 
{ + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); + __syncthreads(); -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (3) Copies the values into new registers using conditional logic. 
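+      // A sketch of the replacement scheme (expository summary, no extra
+      // code): each thread now keeps results[0..7] as float4, i.e. a
+      // 4-row x 8-feature tile of the output held entirely in registers.
+      // Each add_vals(a3, a4, br1, br2, br3, br4) invocation is the 4x8
+      // outer-product update for one k step,
+      //   results[f].{x,y,z,w} += lhs_row{0..3}(k) * rhs_feature_f(k),
+      // so, unlike the removed path above, the finished tile can be written
+      // straight to global memory with no shared-memory staging and no
+      // shuffle-based reduction.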
- -#define makeAssignments(i) \ - val0 = res(i, 0); \ - val1 = res(i, 1); \ - val2 = res(i, 2); \ - val3 = res(i, 3); \ - val4 = res(i, 4); \ - val5 = res(i, 5); \ - val6 = res(i, 6); \ - val7 = res(i, 7); \ - - Scalar val0; - Scalar val1; - Scalar val2; - Scalar val3; - Scalar val4; - Scalar val5; - Scalar val6; - Scalar val7; - - switch (threadIdx.x) { - case 0: - makeAssignments(0); - break; - case 1: - makeAssignments(1); - break; - case 2: - makeAssignments(2); - break; - case 3: - makeAssignments(3); - break; - case 4: - makeAssignments(4); - break; - case 5: - makeAssignments(5); - break; - case 6: - makeAssignments(6); - break; - case 7: - makeAssignments(7); - break; - } + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; -#undef res + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - const Index vert_base = base_m + 4 * threadIdx.y + (threadIdx.x % 4) + 32 * (threadIdx.x / 4); - const Index horiz_base = base_n + 4 * threadIdx.z; - - if (!needs_edge_check || vert_base < m_size) { - if (!needs_edge_check || horiz_base + 35 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - output(vert_base, horiz_base + 34) = val6; - output(vert_base, horiz_base + 35) = val7; - } else if (horiz_base + 34 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - output(vert_base, horiz_base + 34) = val6; - } else if (horiz_base + 33 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - } else if (horiz_base + 32 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - } else if (horiz_base + 3 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - } else if (horiz_base + 2 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - } else if (horiz_base + 1 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - } else if (horiz_base < n_size) { - output(vert_base, horiz_base + 0) = val0; - } - } + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, 
horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } } +} - template +template __global__ void - __launch_bounds__(512) - EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float4 lhs_shmem[(68 * 64) / 4]; - __shared__ float2 rhs_shmem[((66 * 8 + 24) * 8) / 2]; +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + bool check_lhs64 = (base_m + 63) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } } else { 
- EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } } +} + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; - template - struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; - typedef GpuDevice Device; + enum { + Layout = TensorEvaluator::Layout, + }; - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value> right_nocontract_t; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int NumDims = max_n_1::size; - typedef DSizes Dimensions; + typedef DSizes Dimensions; - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} - // We need to redefine this method to make nvcc happy - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); } } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if 
(this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); } else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } + evalTyped(buffer); } } + } + } - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; - // rows in left side - const Index m = this->m_i_size; + // rows in left side + const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; + // columns in right side + const Index n = this->m_j_size; - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - typedef internal::TensorContractionInputMapper LhsMapper; + typedef internal::TensorContractionInputMapper LhsMapper; - typedef internal::TensorContractionInputMapper RhsMapper; + typedef internal::TensorContractionInputMapper RhsMapper; - typedef internal::blas_data_mapper OutputMapper; + typedef internal::blas_data_mapper OutputMapper; - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); - OutputMapper output(buffer, m); + OutputMapper output(buffer, m); + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same::value && + internal::is_same::value) { + if (m < 768 || n < 768) { const Index m_blocks = (m + 63) / 64; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - - cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); - if (internal::is_same::value && - internal::is_same::value) { - EigenFloatContractionKernel - <<m_device.stream()>>>(lhs, rhs, output, m, n, k); - } else { - EigenContractionKernel - <<m_device.stream()>>>(lhs, rhs, output, m, n, k); - } - - assert(cudaGetLastError() == cudaSuccess); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, 
k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); } - }; + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; } // end namespace Eigen #endif // EIGEN_USE_GPU and __CUDACC__ - #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -- cgit v1.2.3 From 0a0ab6dd158e3f4471ba1fe20454de35b18fdce5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 11:45:17 -0800 Subject: Increased the functionality of the tensor devices --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index bb05e4177..efd207507 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -43,11 +43,14 @@ typedef std::promise Promise; static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { f->wait(); - // eigen_assert(f->ready()); } +static EIGEN_STRONG_INLINE void get_when_ready(Future* f) { + f->get(); +} + struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } + ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -79,9 +82,9 @@ struct ThreadPoolDevice { } private: - // todo: NUMA, ... size_t num_threads_; }; + #endif @@ -114,6 +117,10 @@ static inline int sharedMemPerBlock() { return m_deviceProperties.sharedMemPerBlock; } +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + assert(status == cudaSuccess); +} struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. @@ -163,10 +170,19 @@ struct GpuDevice { return 32; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + cudaStreamSynchronize(*stream_); + } + private: // TODO: multigpu. const cudaStream_t* stream_; }; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + #endif } // end namespace Eigen -- cgit v1.2.3 From 71676eaddd7fb6b8abdc5713f437750f3c963fcb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:36:57 -0800 Subject: Added support for RowMajor inputs to the contraction code. 
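
The evaluators keep a single ColMajor code path: when the inputs are
RowMajor, the two operands are swapped and all dimension lists are
reversed, since a RowMajor tensor reinterpreted as ColMajor with
reversed dimensions is the transpose of the original. A contraction
pair (l, r) on the original operands therefore becomes
(LDims - 1 - r, RDims - 1 - l) on the swapped ones, and the output
dimensions are reversed at the end. A worked illustration, using
expository names (A, B, m, n, k) that are not part of the patch:

    // RowMajor A (m x k) and B (k x n), contracted over the pair (1, 0).
    // Reinterpreted as ColMajor with reversed dims, A becomes a (k, m)
    // tensor, i.e. A^T, and B becomes (n, k), i.e. B^T. Swapping the
    // operands evaluates B^T * A^T = (A * B)^T as a ColMajor (n, m)
    // result; reversing its dims gives the RowMajor (m, n) product C.
    // The contraction pair flips accordingly:
    //   eval first  = LDims - 1 - 0 = 1   // dim k of the (n, k) eval-lhs
    //   eval second = RDims - 1 - 1 = 0   // dim k of the (k, m) eval-rhs
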
--- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 265 +++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 43 +++- 3 files changed, 220 insertions(+), 94 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c5ec42cf4..a02a273e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -320,6 +320,8 @@ class TensorContractionInputMapper }; + + template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + template struct traits > { @@ -378,6 +388,10 @@ struct traits > typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + // From NumDims below. + static const int NumDimensions = max_n_1::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int Layout = traits::Layout; + enum { Flags = 0, }; @@ -401,19 +415,19 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; }; } // end namespace internal - - template class TensorContractionOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; typedef typename internal::promote_storage_type::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } protected: typename LhsXprType::Nested m_lhs_xpr; @@ -444,12 +459,17 @@ class TensorContractionOp : public TensorBase struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} template @@ -467,37 +487,94 @@ struct TensorContractionEvaluatorBase typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - 
internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - - typedef DSizes Dimensions; - enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) - { + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = internal::max_n_1::size; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == + TensorEvaluator::Layout), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); - array::Dimensions::count> lhs_strides; + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (Layout == ColMajor) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } + } + + array lhs_strides; lhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; } - array::Dimensions::count> rhs_strides; + array rhs_strides; rhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } m_i_strides[0] = 1; @@ -515,27 +592,28 @@ struct TensorContractionEvaluatorBase m_lhs_inner_dim_contiguous = true; int dim_idx = 0; int nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + + for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor bool contracting = false; - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].first == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { contracting = true; break; } } if (!contracting) { // add dimension size to output dimensions - m_dimensions[dim_idx] = left_dims[i]; + m_dimensions[dim_idx] = eval_left_dims[i]; m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; if (dim_idx != i) { m_lhs_inner_dim_contiguous = false; } if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; } else { - m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; } dim_idx++; nocontract_idx++; @@ -543,22 +621,22 @@ struct TensorContractionEvaluatorBase } nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + for (int i = 0; i < RDims; i++) { bool contracting = false; // find if we are contracting on index i of right tensor - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].second == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { contracting = true; break; } } if (!contracting) { - m_dimensions[dim_idx] = right_dims[i]; + m_dimensions[dim_idx] = eval_right_dims[i]; if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; } else { - m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; } m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; dim_idx++; @@ -573,12 +651,13 @@ struct TensorContractionEvaluatorBase // each tensor, we'll only look at the first tensor here. 
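+    // For instance, contracting the rhs dimensions in the order (0, 1)
+    // leaves m_rhs_inner_dim_contiguous true and m_rhs_inner_dim_reordered
+    // false below, whereas the order (2, 0) clears the contiguity flag
+    // (the first pair misses dim 0) and sets the reordered flag (2 is
+    // followed by the smaller index 0).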
m_rhs_inner_dim_contiguous = true; m_rhs_inner_dim_reordered = false; - for (int i = 0; i < internal::array_size::value; i++) { - Index left = op.indices()[i].first; - Index right = op.indices()[i].second; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; - Index size = left_dims[left]; - eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); if (i+1 < internal::array_size::value) { m_k_strides[i+1] = m_k_strides[i] * size; @@ -588,7 +667,7 @@ struct TensorContractionEvaluatorBase m_left_contracting_strides[i] = lhs_strides[left]; m_right_contracting_strides[i] = rhs_strides[right]; - if (i > 0 && right < op.indices()[i-1].second) { + if (i > 0 && right < eval_op_indices[i-1].second) { m_rhs_inner_dim_reordered = true; } if (right != i) { @@ -597,9 +676,16 @@ struct TensorContractionEvaluatorBase } // Scalar case. We represent the result as a 1d tensor of size 1. - if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (LDims + RDims == 2 * ContractDims) { m_dimensions[0] = 1; } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (Layout == RowMajor) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -661,10 +747,10 @@ struct TensorContractionEvaluatorBase const Index rows = m_i_size; const Index cols = m_k_size; - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; typedef internal::TensorContractionInputMapper m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; }; +// evaluator for default device template struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase, Device> > { + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; @@ -759,15 +846,35 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + static const int NumDims = internal::max_n_1::size; + + // Could we use NumDimensions here? typedef DSizes Dimensions; @@ -799,15 +906,15 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; const Index nr = Traits::nr; const Index mr = Traits::mr; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; @@ -826,10 +933,10 @@ struct TensorEvaluator OutputMapper; - // Declare GEBP packing and kernel structs internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; // initialize data mappers diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f6bd949bd..588770bb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1241,10 +1241,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f0e9bb616..5851e5adc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -70,24 +70,43 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + 
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} -- cgit v1.2.3 From b12dd1ae3cc4077740dded430bc244623a6cc3b8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:39:34 -0800 Subject: Misc improvements for fixed size tensors --- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 32 ++++++++++++++++++---- unsupported/test/cxx11_tensor_fixed_size.cpp | 13 +++++---- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1af2d7bcd..94b3f957b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,7 +42,9 @@ class TensorFixedSize : public TensorBase::size > 1), - }; + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; @@ -51,11 +53,12 @@ class TensorFixedSize : public TensorBase m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -187,6 +190,23 @@ class TensorFixedSize : public TensorBase Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 99ffc7f07..8a27f5ad8 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,13 +32,14 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, Sizes<6>()); + TensorMap > > vec3(data3, 6); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, Sizes<6>()); + TensorMap, RowMajor> > vec4(data4, 6); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); + VERIFY_IS_EQUAL(vec3.rank(), 1); // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); @@ -68,11 +69,12 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1, Sizes<2, 3>()); + TensorMap >> mat1(data1,2,3); float data2[6]; - TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); + TensorMap, RowMajor>> mat2(data2,2,3); VERIFY_IS_EQUAL((mat1.size()), 2*3); + VERIFY_IS_EQUAL(mat1.rank(), 2); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); @@ -120,6 +122,7 @@ static void test_3d() TensorFixedSize, RowMajor> mat2; VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + VERIFY_IS_EQUAL(mat1.rank(), 3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); @@ -166,7 +169,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array{{i,j,k}}) = val; + mat1(i,j,k) = val; val += 1.0; } } -- cgit v1.2.3 From 7e0b6c56b45be9adf002e59f97902c8a760519af Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:41:30 -0800 Subject: Added ability to initialize a tensor using an initializer list --- .../Eigen/CXX11/src/Tensor/TensorInitializer.h | 82 ++++++++++++++++++++++ 1 file changed, 82 
insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 000000000..6afef0fbb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list< + typename Initializer::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + // Static initialization not implemented for VarDims tensors. 
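+    // (The rank must be known at compile time: the recursion above peels
+    // one initializer_list level per dimension, so the nesting depth has
+    // to match NumDimensions. A runtime-ranked tensor would need a
+    // different traversal.)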
+ eigen_assert(false); + } +}; + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -- cgit v1.2.3 From 1a36590e8475f688ef42122c0dd96f7a3b89654e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:43:20 -0800 Subject: Fixed the printing of RowMajor tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 15 +++++-- unsupported/test/cxx11_tensor_io.cpp | 58 +++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index 959b5db73..a9d0f6c39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -12,6 +12,14 @@ namespace Eigen { +namespace internal { +template<> +struct significant_decimals_impl + : significant_decimals_default_impl +{}; +} + + template std::ostream& operator << (std::ostream& os, const TensorBase& expr) { // Evaluate the expression if needed @@ -19,18 +27,19 @@ std::ostream& operator << (std::ostream& os, const TensorBase, DefaultDevice> tensor(eval, DefaultDevice()); tensor.evalSubExprsIfNeeded(NULL); - typedef typename T::Scalar Scalar; + typedef typename internal::remove_const::type Scalar; typedef typename T::Index Index; typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; const Index total_size = internal::array_prod(tensor.dimensions()); // Print the tensor as a 1d vector or a 2d matrix. 
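+  // (A rank-3 tensor with dims (5, 3, 2), for example, is displayed as the
+  // 5 x 6 matrix given by first_dim x (total_size / first_dim); the Map now
+  // uses the tensor's own Layout so that RowMajor storage is traversed in
+  // the right order.)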
if (internal::array_size::value == 1) { - Map > array(tensor.data(), total_size); + Map > array(const_cast(tensor.data()), total_size); os << array; } else { const Index first_dim = tensor.dimensions()[0]; - Map > matrix(tensor.data(), first_dim, total_size/first_dim); + static const int layout = TensorEvaluator, DefaultDevice>::Layout; + Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); os << matrix; } diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index b73c024f5..8bbcf7089 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -13,9 +13,10 @@ #include +template static void test_output_1d() { - Tensor tensor(5); + Tensor tensor(5); for (int i = 0; i < 5; ++i) { tensor(i) = i; } @@ -28,9 +29,10 @@ static void test_output_1d() } +template static void test_output_2d() { - Tensor tensor(5, 3); + Tensor tensor(5, 3); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 3; ++j) { tensor(i, j) = i*j; @@ -45,10 +47,11 @@ static void test_output_2d() } +template static void test_output_expr() { - Tensor tensor1(5); - Tensor tensor2(5); + Tensor tensor1(5); + Tensor tensor2(5); for (int i = 0; i < 5; ++i) { tensor1(i) = i; tensor2(i) = 7; @@ -62,9 +65,50 @@ static void test_output_expr() } +template +static void test_output_string() +{ + Tensor tensor(5, 3); + tensor.setConstant(std::string("foo")); + + std::cout << tensor << std::endl; + + std::stringstream os; + os << tensor; + + std::string expected("foo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template +static void test_output_const() +{ + Tensor tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + TensorMap > tensor_map(tensor.data(), 5); + + std::stringstream os; + os << tensor_map; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + void test_cxx11_tensor_io() { - CALL_SUBTEST(test_output_1d()); - CALL_SUBTEST(test_output_2d()); - CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_string()); + CALL_SUBTEST(test_output_string()); + CALL_SUBTEST(test_output_const()); + CALL_SUBTEST(test_output_const()); } -- cgit v1.2.3 From 0526dc1bb4091c484f5a0dab71818f48c0d4fc5f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:44:08 -0800 Subject: Added missing apis to the tensor class --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 105 ++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index ceed09505..e125ca799 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -77,18 +77,20 @@ class Tensor : public TensorBase > enum { IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, }; static const int Options = Options_; - static const std::size_t NumIndices = NumIndices_; - - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } @@ -153,6 +155,27 @@ class Tensor : public TensorBase > EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array(i0, i1, i2, i3, i4)); + } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const @@ -182,6 +205,27 @@ class Tensor : public TensorBase > EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) return operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array(i0, i1, i2, i3, i4)); + } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) @@ -223,6 +267,32 @@ class Tensor : public TensorBase > // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } +#else + inline explicit Tensor(Index dim1) + : m_storage(dim1, array(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif inline explicit Tensor(const array& dimensions) @@ -231,24 +301,24 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) { typedef TensorAssignOp Assign; Assign assign(*this, other.derived()); resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); } - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) @@ -297,7 +367,16 @@ class Tensor : public TensorBase > #endif } + void resize(const DSizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + protected: + bool checkIndexRange(const array& indices) const { using internal::array_apply_and_reduce; -- cgit v1.2.3 From 378bdfb7f0c4b2a8eb2b91c2a65f3bc1c57e689e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:45:20 -0800 Subject: Added missing apis to the TensorMap class --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 0a8c10ac7..2cb2bc7a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -48,6 +48,8 @@ template class TensorMap : public Tensor enum { IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), + Layout = PlainObjectType::Layout, + CoordAccess = true, }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES @@ -62,13 +64,35 @@ template class TensorMap : public Tensor // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif + inline TensorMap(PointerArgType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + template - inline TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC -- cgit v1.2.3 From 1ac86001266db55b78086617fb68206b29748919 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:47:46 -0800 Subject: Fixed the return type of coefficient wise operations. For example, the abs function returns a floating point value when called on a complex input. --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 58 ++++++++++++++- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 87 +++++++++++++--------- 2 files changed, 107 insertions(+), 38 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f7c784942..97f225f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -34,9 +34,15 @@ struct TensorEvaluator typedef typename Derived::Packet PacketReturnType; typedef typename Derived::Dimensions Dimensions; + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? 
+ internal::traits::NumDimensions : 0; + enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -77,6 +83,24 @@ struct TensorEvaluator return internal::pstoret(m_data + index, x); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + if (Layout == ColMajor) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + eigen_assert(m_data); + if (Layout == ColMajor) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + Scalar* data() const { return m_data; } protected: @@ -97,9 +121,15 @@ struct TensorEvaluator typedef typename Derived::Packet PacketReturnType; typedef typename Derived::Dimensions Dimensions; + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) @@ -126,6 +156,17 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + const Index index = (Layout == ColMajor) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + const Scalar* data() const { return m_data; } protected: @@ -146,6 +187,8 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC @@ -194,6 +237,8 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -247,6 +292,8 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -254,7 +301,8 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } typedef typename XprType::Index Index; @@ -309,6 +357,8 @@ struct TensorEvaluator IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -316,8 +366,10 @@ 
struct TensorEvaluator m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) { - eigen_assert(internal::dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(internal::dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); } typedef typename XprType::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 6e5503de1..b66b3ec2c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -17,14 +17,14 @@ namespace Eigen { * * \brief Tensor expression classes. * - * The TensorCwiseNullaryOp class applies a nullary operators to an expression. This - * is typically used to generate constants. + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. * * The TensorCwiseUnaryOp class represents an expression where a unary operator * (e.g. cwiseSqrt) is applied to an expression. * - * The TensorCwiseBinaryOp class represents an expression where a binary operator - * (e.g. addition) is applied to a lhs and a rhs expression. + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. * */ namespace internal { @@ -33,9 +33,12 @@ struct traits > : traits { typedef typename XprType::Packet Packet; + typedef traits XprTraits; typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -47,7 +50,7 @@ struct traits > template -class TensorCwiseNullaryOp : public TensorBase > +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -81,12 +84,15 @@ template struct traits > : traits { - typedef typename result_of< - UnaryOp(typename XprType::Scalar) - >::type Scalar; + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; typedef typename internal::packet_traits::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -106,14 +112,16 @@ struct nested, 1, typename eval -class TensorCwiseUnaryOp : public TensorBase > +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> { public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. 
typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -139,22 +147,27 @@ namespace internal { template struct traits > { - // Type promotion to handle the case where the types of the lhs and the rhs are different. + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. typedef typename result_of< - BinaryOp( - typename LhsXprType::Scalar, - typename RhsXprType::Scalar - ) - >::type Scalar; + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits XprTraits; typedef typename internal::packet_traits::type Packet; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; + typedef typename promote_storage_type< + typename traits::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits::Index, + typename traits::Index>::type Index; typedef typename LhsXprType::Nested LhsNested; typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -178,21 +191,22 @@ struct nested, 1, typename template -class TensorCwiseBinaryOp : public TensorBase > +class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } @@ -219,7 +233,8 @@ struct traits > : traits { typedef typename traits::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -227,6 +242,8 @@ struct traits > typedef typename IfXprType::Nested IfNested; typedef typename ThenXprType::Nested ThenNested; typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template -- cgit v1.2.3 From 0feff6e987750a61f0ee14774efaef85d2fb6fac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:29:48 -0800 Subject: Expanded the functionality of index lists --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 105 ++++++++++++++++- unsupported/test/cxx11_tensor_index_list.cpp | 131 +++++++++++++++++++++ 2 files changed, 231 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index eaf0195ce..209749042 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -95,6 +95,20 @@ struct tuple_coeff { return ((i == Idx) & is_compile_time_constant >::type>::value) || tuple_coeff::value_known_statically(i, t); } + + template + static constexpr bool values_up_to_known_statically(const std::tuple& t) { + return is_compile_time_constant >::type>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + return is_compile_time_constant >::type>::value && + is_compile_time_constant >::type>::value && + std::get(t) > std::get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } }; template <> @@ -110,10 +124,20 @@ struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } + + template + static constexpr bool values_up_to_known_statically(const std::tuple& t) { + return is_compile_time_constant >::type>::value; + } + + template + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + return true; + } }; } // namespace internal @@ -133,6 +157,13 @@ struct IndexList : std::tuple { constexpr bool value_known_statically(const DenseIndex i) const { return 
internal::tuple_coeff >::value-1>::value_known_statically(i, *this); } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff >::value-1>::values_up_to_statically_known_to_increase(*this); + } }; @@ -144,6 +175,14 @@ constexpr IndexList make_index_list(FirstType val1, Ot namespace internal { +template size_t array_prod(const IndexList& sizes) { + size_t result = 1; + for (int i = 0; i < array_size >::value; ++i) { + result *= sizes[i]; + } + return result; +} + template struct array_size > { static const size_t value = std::tuple_size >::value; }; @@ -179,6 +218,48 @@ struct index_known_statically > { } }; +template +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return IndexList().all_values_known_statically(); + } +}; + +template +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template +struct indices_statically_known_to_increase > { + constexpr bool operator() () const { + return IndexList().values_statically_known_to_increase(); + } +}; + +template +struct indices_statically_known_to_increase > { + constexpr bool operator() () const { + return IndexList().values_statically_known_to_increase(); + } +}; + template struct index_statically_eq { constexpr bool operator() (DenseIndex, DenseIndex) const { @@ -190,7 +271,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); + IndexList()[i] == value; } }; @@ -198,7 +279,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); + IndexList()[i] == value; } }; @@ -213,7 +294,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] != value); + IndexList()[i] != value; } }; @@ -221,7 +302,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] != value); + IndexList()[i] != value; } }; @@ -242,6 +323,20 @@ struct index_known_statically { } }; +template +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + template struct index_statically_eq { EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index 6a103cab1..d79a3ed45 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -44,6 +44,120 @@ static void test_static_index_list() } +static void 
test_type2index_list() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + tensor += tensor.constant(10.0f); + + typedef Eigen::IndexList> Dims0; + typedef Eigen::IndexList, Eigen::type2index<1>> Dims1; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>> Dims2; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4; + +#if 0 + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const Dims0 reduction_axis0; + Tensor result0 = tensor.sum(reduction_axis0); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + VERIFY_IS_APPROX(result0(j,k,l,m), expected); + } + } + } + } + + const Dims1 reduction_axis1; + Tensor result1 = tensor.sum(reduction_axis1); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + float expected = 0.0f; + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + VERIFY_IS_APPROX(result1(k,l,m), expected); + } + } + } + + const Dims2 reduction_axis2; + Tensor result2 = tensor.sum(reduction_axis2); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + float expected = 0.0f; + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + VERIFY_IS_APPROX(result2(l,m), expected); + } + } + + const Dims3 reduction_axis3; + Tensor result3 = tensor.sum(reduction_axis3); + for (int m = 0; m < 11; ++m) { + float expected = 0.0f; + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + 
} + } + VERIFY_IS_APPROX(result3(m), expected); + } + + const Dims4 reduction_axis4; + Tensor result4 = tensor.sum(reduction_axis4); + float expected = 0.0f; + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + } + VERIFY_IS_APPROX(result4(0), expected); +} + + static void test_dynamic_index_list() { Tensor tensor(2,3,5,7); @@ -105,10 +219,25 @@ static void test_mixed_index_list() EIGEN_STATIC_ASSERT((internal::index_known_statically()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + typedef IndexList, type2index<1>, type2index<2>, type2index<3>> ReductionList; + ReductionList reduction_list; + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif Tensor result1 = tensor.sum(reduction_axis); Tensor result2 = tensor.sum(reduction_indices); + Tensor result3 = tensor.sum(reduction_list); float expected = 0.0f; for (int i = 0; i < 2; ++i) { @@ -122,12 +251,14 @@ static void test_mixed_index_list() } VERIFY_IS_APPROX(result1(0), expected); VERIFY_IS_APPROX(result2(0), expected); + VERIFY_IS_APPROX(result3(0), expected); } void test_cxx11_tensor_index_list() { CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); } -- cgit v1.2.3 From 4cdf3fe427b4fdc271733d0404a66e2d5613cb16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:30:47 -0800 Subject: Misc fixes --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 130 +++++++-------------- 1 file changed, 41 insertions(+), 89 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 6d9e09318..6c9a67c58 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -40,6 +40,10 @@ template struct IndexPair { // Boilerplate code namespace internal { +template struct dget { + static const std::size_t value = get::value; +}; + template struct fixed_size_tensor_index_linearization_helper @@ -49,7 +53,7 @@ struct fixed_size_tensor_index_linearization_helper const Dimensions& dimensions) { return array_get(indices) + - get::value * + dget::value * fixed_size_tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -75,6 +79,10 @@ struct Sizes : internal::numeric_list { typedef 
internal::numeric_list Base; static const std::size_t total_size = internal::arg_prod(Indices...); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { return internal::arg_prod(Indices...); } @@ -85,6 +93,7 @@ struct Sizes : internal::numeric_list { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template Sizes(DenseIndex... indices) { } explicit Sizes(std::initializer_list /*l*/) { // todo: add assertion } @@ -121,11 +130,15 @@ struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes : typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type { +template struct Sizes { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; static const size_t count = Base::count; static const std::size_t total_size = internal::arg_prod::value; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { return internal::arg_prod::value; } @@ -160,11 +173,11 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this); } }; @@ -208,6 +221,10 @@ struct DSizes : array { typedef array Base; static const std::size_t count = NumDims; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return NumDims; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { return internal::array_prod(*static_cast(this)); } @@ -219,31 +236,44 @@ struct DSizes : array { } EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... 
otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array{{firstDimension, otherDimensions...}}; + } +#else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); (*this)[0] = i0; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); (*this)[0] = i0; (*this)[1] = i1; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; (*this)[4] = i4; } +#endif DSizes& operator = (const array& other) { *static_cast(this) = other; @@ -287,84 +317,6 @@ struct tensor_vsize_index_linearization_helper }; } // end namespace internal -template -struct VSizes : std::vector { - typedef std::vector Base; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { - return internal::array_prod(*static_cast(this)); - } - - EIGEN_DEVICE_FUNC VSizes() { } - EIGEN_DEVICE_FUNC explicit VSizes(const std::vector& a) : Base(a) { } - - template - EIGEN_DEVICE_FUNC explicit VSizes(const array& a) { - this->resize(NumDims); - for (int i = 0; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } - - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { - this->resize(1); - (*this)[0] = i0; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { - this->resize(2); - (*this)[0] = i0; - (*this)[1] = i1; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { - this->resize(3); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { - this->resize(4); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { - this->resize(5); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - (*this)[4] = i4; - } - - VSizes& operator = (const std::vector& other) { - *static_cast(this) = other; - return *this; - } - - // A constexpr would be so much better here - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - - -// Boilerplate -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes& sizes) { - DenseIndex total_size = 1; - for (int i = 0; i < sizes.size(); ++i) { - total_size *= sizes[i]; - } - return total_size; -} -} namespace internal { @@ -381,8 +333,8 @@ static const size_t value = Sizes::count; template 
struct array_size > { static const size_t value = Sizes::count; }; - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { - return get::Base>::value; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { + return get >::value; } #else template struct array_size > { @@ -412,17 +364,17 @@ struct sizes_match_up_to_dim { } }; +} // end namespace internal + + template bool dimensions_match(Dims1& dims1, Dims2& dims2) { - if (array_size::value != array_size::value) { + if (internal::array_size::value != internal::array_size::value) { return false; } - return sizes_match_up_to_dim::value-1>::run(dims1, dims2); + return internal::sizes_match_up_to_dim::value-1>::run(dims1, dims2); } -} // end namespace internal - - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H -- cgit v1.2.3 From 703c526355c929cc6c422b7599ecfed57642e988 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:31:52 -0800 Subject: Misc improvements --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 20 ++++-- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 12 ++-- .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 81 ++++++++++++++++++---- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 1e6b97ce4..36d91e780 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. * typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; @@ -370,6 +370,14 @@ constexpr inline auto array_prod(std::array arr) -> decltype(array_reduce< return array_reduce(arr); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + /* zip an array */ template diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index e30eb6ad8..a590cf4e1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -48,15 +48,13 @@ namespace internal { * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance directly - * - libc++ from version 
3.4 onwards has it IF compiled with - * -std=c++1y - * - libc++ older versions or -std=c++11: use __elems_ directly + * - libc++ all versions so far: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) +#elif defined(_LIBCPP_VERSION) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get(a) @@ -70,14 +68,14 @@ template constexpr inline T& array_get(std::vector template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } - #undef STD_GET_ARR_HACK template struct array_size; -template struct array_size > { +template struct array_size > { static const size_t value = N; }; -template struct array_size > { +template struct array_size; +template struct array_size > { static const size_t value = N; }; diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index e45d0a3b1..494f95690 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -29,7 +29,7 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } - EIGEN_DEVICE_FUNC + explicit EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v) { EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v; @@ -106,6 +106,7 @@ template class array { #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { + eigen_assert(l.size() == n); std::copy(l.begin(), l.end(), values); } #endif @@ -211,6 +212,29 @@ template struct gen_numeric_list_repeated { template struct get; +template +struct get +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + template struct get<0, type_list > { @@ -221,10 +245,11 @@ struct get<0, type_list > template struct get > { - typedef typename get::type type; + typedef typename Tail::HeadType::type type; static const type value = get::value; }; + template struct arg_prod { static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; }; @@ -354,23 +379,51 @@ struct greater_equal_zero_op { template -inline bool array_apply_and_reduce(const array& a) { - EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) - bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); - for (size_t i = 2; i < N; ++i) { - result = Reducer::run(result, Op::run(a[i])); +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; } - return result; +}; + +template +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + return Op::run(a[0]); + } +}; + +template +inline bool array_apply_and_reduce(const array& a) { + return ArrayApplyAndReduce::run(a); } template -inline bool 
array_zip_and_reduce(const array& a, const array& b) { - EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) - bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); - for (size_t i = 2; i < N; ++i) { - result = Reducer::run(result, Op::run(a[i], b[i])); +struct ArrayZipAndReduce { + static inline bool run(const array& a, const array& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; } - return result; +}; + +template +struct ArrayZipAndReduce { + static inline bool run(const array& a, const array& b) { + return Op::run(a[0], b[0]); + } +}; + +template +inline bool array_zip_and_reduce(const array& a, const array& b) { + return ArrayZipAndReduce::run(a, b); } } // end namespace internal -- cgit v1.2.3 From 8a382aa119274efd2eb73b822ae7cd2afa128cc5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:33:11 -0800 Subject: Improved the resizing of tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index aaec39756..dfe85602a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -114,16 +114,12 @@ class TensorStorage& dimensions() const {return m_dimensions;} - void conservativeResize(DenseIndex size, const array& nbDimensions) - { - m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, internal::array_prod(m_dimensions)); - m_dimensions = nbDimensions; - } void resize(DenseIndex size, const array& nbDimensions) { - if(size != internal::array_prod(m_dimensions)) + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) { - internal::conditional_aligned_delete_auto(m_data, internal::array_prod(m_dimensions)); + internal::conditional_aligned_delete_auto(m_data, currentSz); if (size) m_data = internal::conditional_aligned_new_auto(size); else @@ -139,8 +135,6 @@ class TensorStorage -- cgit v1.2.3 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:34:50 -0800 Subject: Ensured that each thread has its own copy of the TensorEvaluator: this avoids race conditions when the evaluator calls a non-thread-safe functor, e.g. when generating random numbers.
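As an illustration of the pattern this commit adopts, here is a minimal standalone sketch in plain C++. The names Evaluator and evalRange are simplified stand-ins invented for this sketch, not the real TensorEvaluator or EvalRange types: because the range function now takes the evaluator by value, every worker thread operates on a private copy of any stateful functor, such as a random number generator.

#include <thread>
#include <vector>
#include <random>

// Stand-in for a tensor evaluator whose functor carries mutable state.
struct Evaluator {
  std::mt19937 rng;  // per-copy state; sharing one instance across threads would be a data race
  float* out;
  void evalScalar(int i) { out[i] = static_cast<float>(rng()); }
};

// Taking the evaluator by value (not by pointer) mirrors the new
// EvalRange::run signature: each task mutates only its own copy.
static void evalRange(Evaluator ev, int first, int last) {
  for (int i = first; i < last; ++i) ev.evalScalar(i);
}

int main() {
  std::vector<float> data(1000);
  Evaluator ev{std::mt19937(42), data.data()};
  std::thread t1(evalRange, ev, 0, 500);     // receives copy #1 of ev
  std::thread t2(evalRange, ev, 500, 1000);  // receives copy #2 of ev
  t1.join();
  t2.join();
  // Both copies start from the same seed here; the point of the sketch is
  // only the absence of shared mutable state between the two threads.
  return 0;
}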
--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 27 +++++++++++----------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f27f643c1..d93fdd907 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -77,17 +77,17 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator* evaluator, const Index first, const Index last) { + static void run(Evaluator evaluator, const Index first, const Index last) { eigen_assert(last > first); for (Index i = first; i < last; ++i) { - evaluator->evalScalar(i); + evaluator.evalScalar(i); } } }; template struct EvalRange { - static void run(Evaluator* evaluator, const Index first, const Index last) { + static void run(Evaluator evaluator, const Index first, const Index last) { eigen_assert(last > first); Index i = first; @@ -96,12 +96,12 @@ struct EvalRange { eigen_assert(first % PacketSize == 0); Index lastPacket = last - (last % PacketSize); for (; i < lastPacket; i += PacketSize) { - evaluator->evalPacket(i); + evaluator.evalPacket(i); } } for (; i < last; ++i) { - evaluator->evalScalar(i); + evaluator.evalScalar(i); } } }; @@ -130,16 +130,17 @@ class TensorExecutor std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize)); } - for (int i = 0; i < numblocks; ++i) { - results[i].get(); + if (numblocks * blocksize < size) { + EvalRange::run(evaluator, numblocks * blocksize, size); } - if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); + for (int i = 0; i < numblocks; ++i) { + get_when_ready(&results[i]); } + } evaluator.cleanup(); } @@ -168,7 +169,8 @@ __launch_bounds__(1024) const Index PacketSize = unpacket_traits::size; const Index vectorized_step_size = step_size * PacketSize; const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) { + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { eval.evalPacket(i); } for (Index i = vectorized_size + first_index; i < size; i += step_size) { @@ -192,8 +194,7 @@ class TensorExecutor const int block_size = maxCudaThreadsPerBlock(); const Index size = array_prod(evaluator.dimensions()); - EigenMetaKernel, Index><<>>(evaluator, size); - assert(cudaGetLastError() == cudaSuccess); + LAUNCH_CUDA_KERNEL((EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } -- cgit v1.2.3 From f697df723798779bc29d9f7299bb5398767d5db0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:38:48 -0800 Subject: Improved support for RowMajor tensors. Misc fixes and API cleanups.
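A short usage sketch of what the row-major support enables. This is illustrative only: it assumes the Layout plumbing introduced in the diff below, and the include path is assumed to be the usual one for the unsupported module. The Options template argument selects the storage order, and evaluating an expression that mixes ColMajor and RowMajor operands now trips a compile-time Layout assertion.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2, Eigen::ColMajor> col(2, 3);  // ColMajor is the default
  Eigen::Tensor<float, 2, Eigen::RowMajor> row(2, 3);
  col.setConstant(0.0f);
  row.setConstant(0.0f);
  col(1, 0) = 1.0f;
  row(1, 0) = 1.0f;
  // Same logical coefficient, different linear index:
  // ColMajor linearizes (i, j) as i + 2*j, so (1, 0) lands at offset 1;
  // RowMajor linearizes (i, j) as 3*i + j, so (1, 0) lands at offset 3.
  std::cout << col.data()[1] << " " << row.data()[3] << std::endl;  // prints "1 1"
  // Evaluating a mixed-layout expression, e.g.
  //   Eigen::Tensor<float, 2> bad = col + row;
  // now fails to compile via the Layout static assertions added below.
  return 0;
}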
--- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 315 ++++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 166 +++++++++-- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 208 +++++++++++--- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 75 +++-- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 50 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 33 ++- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 24 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 142 ++++++++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 223 +++++++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 171 +++++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 46 ++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 54 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 175 ++++++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 53 ++++ 17 files changed, 1403 insertions(+), 354 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index e973c00d3..93938bd1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -33,6 +33,8 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const std::size_t NumDimensions = internal::traits::NumDimensions; + static const int Layout = internal::traits::Layout; enum { Flags = 0, @@ -94,12 +96,18 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) - { } + { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -114,7 +122,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - eigen_assert(internal::dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non // null value), attempt to evaluate the rhs expression in place.
Returns true iff in place diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index f451a3c99..8860f622b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -25,77 +25,118 @@ template class TensorBase { public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::Index Index; - typedef Scalar CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; - // Dimensions - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } - // Nullary operators + // Coefficient-wise nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> constant(const Scalar& value) const { - return TensorCwiseNullaryOp, const Derived> - (derived(), internal::scalar_constant_op(value)); + return nullaryExpr(internal::scalar_constant_op(value)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(derived()); + return nullaryExpr(internal::UniformRandomGenerator()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp random() const { - return TensorCwiseNullaryOp(derived()); + return nullaryExpr(RandomGenerator()); + } + + // Generic unary operation support. 
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); } // Coefficient-wise unary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator-() const { return derived(); } + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sqrt() const { return derived(); } + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - square() const { return derived(); } + square() const { + return unaryExpr(internal::scalar_square_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - inverse() const { return derived(); } + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - exp() const { return derived(); } + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log() const { return derived(); } + log() const { + return unaryExpr(internal::scalar_log_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - abs() const { return derived(); } + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> pow(Scalar exponent) const { - return TensorCwiseUnaryOp, const Derived> - (derived(), internal::scalar_pow_op(exponent)); + return unaryExpr(internal::scalar_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits::is_signed || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op(rhs)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator * (Scalar scale) const { - return TensorCwiseUnaryOp, const Derived> - (derived(), internal::scalar_multiple_op(scale)); + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op(rhs)); } EIGEN_DEVICE_FUNC @@ -110,86 +151,106 @@ class TensorBase return cwiseMin(constant(threshold)); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp - unaryExpr(const CustomUnaryOp& func) const { - return TensorCwiseUnaryOp(derived(), func); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> cast() const { - return derived(); + return unaryExpr(internal::scalar_cast_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); } // Coefficient-wise binary operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator+(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_sum_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator-(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_difference_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator*(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_product_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator/(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_quotient_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_max_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); } // Comparisons and tests. 
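// The comparisons that follow build boolean expressions from the std::
// functors (std::less and friends), and select(), grouped here with the
// other coefficient-wise ops, then chooses per coefficient between two
// equally-shaped tensors. Sketch (illustrative only):
#include <unsupported/Eigen/CXX11/Tensor>

void comparison_select_sketch() {
  Eigen::Tensor<float, 2> a(2, 2), b(2, 2);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  Eigen::Tensor<bool, 2> mask = a < b;                // std::less<Scalar>
  Eigen::Tensor<float, 2> mixed = mask.select(a, b);  // a where mask is true, else b
}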
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::less()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::less_equal()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::greater()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::greater_equal()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator==(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::equal_to()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::not_equal_to()); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Contractions. @@ -208,29 +269,72 @@ class TensorBase return TensorConvolutionOp(derived(), kernel.derived(), dims); } - // Coefficient-wise ternary operators. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { - return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); - } - // Reductions. 
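// The hunk below generalizes the reductions: each reducer gains a dims-less
// overload that reduces over every dimension of the input. Sketch
// (illustrative; assumes a rank-0 tensor is a valid reduction result):
#include <unsupported/Eigen/CXX11/Tensor>

void reduction_sketch() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(1.0f);
  Eigen::array<int, 1> dims;
  dims[0] = 0;                                     // reduce over dimension 0
  Eigen::Tensor<float, 1> col_sums = t.sum(dims);  // shape (3), each entry 2
  Eigen::Tensor<float, 0> total = t.sum();         // reduces all dims, value 6
}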
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> sum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const array, const Derived> + sum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); } + + const TensorReductionOp, const array, const Derived> + mean() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const array, const Derived> + prod() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + const TensorReductionOp, const array, const Derived> + maximum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MaxReducer()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> minimum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); } + + const TensorReductionOp, const array, const Derived> + minimum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MinReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReductionOp reduce(const Dims& dims, const Reducer& reducer) const { @@ -258,17 +362,44 @@ class TensorBase template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorImagePatchOp extract_image_patches() const { - return TensorImagePatchOp(derived(), Rows, Cols, 1, 1); + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1, PADDING_SAME); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1, padding_type); + } + + template 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), Rows, Cols, stride, stride, padding_type); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorImagePatchOp extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride = 1, const Index col_stride = 1) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride); + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + PADDING_SAME); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + padding_type); } // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { @@ -279,10 +410,20 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp chip(const Index offset) const { - return TensorChippingOp(derived(), offset); + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorPaddingOp @@ -308,21 +449,24 @@ class TensorBase protected: template friend class Tensor; + template friend class TensorVarDim; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; - template class TensorBase : public TensorBase { public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::Index Index; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; template friend class Tensor; + template friend class TensorVarDim; template friend class TensorBase; EIGEN_DEVICE_FUNC @@ -337,24 +481,43 @@ class TensorBase : public TensorBaserandom(); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const OtherDerived& other) { - return derived() = 
TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() + other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() - other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() * other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() / other.derived(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp reshape(const NewDimensions& newDimensions) const { @@ -365,16 +528,26 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp chip(const Index offset) const { - return TensorChippingOp(derived(), offset); + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { return TensorShufflingOp(derived(), shuffle); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } // Select the device on which to evaluate the expression. 
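// The device() support that follows lets the caller pick where an expression
// is evaluated; the compound assignments above are now plain sugar over the
// coefficient-wise operators. Sketch (illustrative only):
#include <unsupported/Eigen/CXX11/Tensor>

void device_sketch() {
  Eigen::Tensor<float, 2> a(2, 2), b(2, 2), c(2, 2);
  a.setRandom();
  b.setRandom();
  c.device(Eigen::DefaultDevice()) = a + b;  // evaluate on an explicit device
  c += a;                                    // rewritten as c = c + a
}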
template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 8cb41aec8..ef134adf2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -30,6 +30,8 @@ struct traits > : public traits::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -91,6 +93,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -103,11 +106,20 @@ struct TensorEvaluator, Device> m_dimensions[i] = input_dims[i] * broadcast[i]; } - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } } } @@ -125,16 +137,30 @@ struct TensorEvaluator, Device> m_impl.cleanup(); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (Layout == ColMajor) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } } index -= idx * m_outputStrides[i]; } @@ -142,15 +168,59 @@ struct TensorEvaluator, Device> eigen_assert(index < m_impl.dimensions()[0]); inputIndex += index; } else { - inputIndex += (index % m_impl.dimensions()[0]); + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } } return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (Layout == ColMajor) { + return packetColMajor(index); + } else { + return packetRowMajor(index); + } + } + // Ignore the LoadMode and always use unaligned loads since we can't guarantee // the alignment at compile time. 
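// The column-major mapping in coeffColMajor() above, restated as a tiny
// standalone helper (hypothetical function, 2-D case) to make the div/mod
// structure explicit: each output coordinate wraps modulo the input size.
long broadcast_src_index_2d(long index,
                            const long in_dims[2],
                            const long in_strides[2],
                            const long out_strides[2]) {
  long input_index = 0;
  long idx = index / out_strides[1];                  // coordinate in dim 1
  input_index += (idx % in_dims[1]) * in_strides[1];  // wrap into the input
  index -= idx * out_strides[1];
  input_index += index % in_dims[0];                  // innermost dimension
  return input_index;
}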
template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -161,10 +231,15 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } } index -= idx * m_outputStrides[i]; } @@ -173,7 +248,12 @@ struct TensorEvaluator, Device> eigen_assert(index < m_impl.dimensions()[0]); innermostLoc = index; } else { - innermostLoc = index % m_impl.dimensions()[0]; + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } } inputIndex += innermostLoc; @@ -185,13 +265,67 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { - values[i] = coeff(originalIndex+i); + values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); return rslt; } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b862a8fd3..bc336e488 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -21,34 +21,61 @@ namespace Eigen { */ namespace internal { -template +template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; }; -template +template struct eval, Eigen::Dense> { typedef const TensorChippingOp& type; }; -template +template struct nested, 1, typename eval >::type> { typedef TensorChippingOp type; }; +template +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + } // end namespace internal -template +template class TensorChippingOp : public TensorBase > { public: @@ -61,34 +88,39 @@ class TensorChippingOp : public TensorBase > typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset) - : m_xpr(expr), m_offset(offset) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + static const bool Vectorize = 
TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } protected: typename XprType::Nested m_xpr; const Index m_offset; + const internal::DimensionId m_dim; }; // Eval as rvalue -template +template struct TensorEvaluator, Device> { typedef TensorChippingOp XprType; @@ -96,41 +128,50 @@ struct TensorEvaluator, Device> static const int NumDims = NumInputDims-1; typedef typename XprType::Index Index; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. IsAligned = false, - PacketAccess = false, // not yet implemented + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { // We could also support the case where NumInputDims==1 if needed. EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumInputDims > DimId, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); int j = 0; for (int i = 0; i < NumInputDims; ++i) { - if (i != DimId) { + if (i != m_dim.actualDim()) { m_dimensions[j] = input_dims[i]; ++j; } } - m_stride = 1; - m_inputStride = 1; - for (int i = 0; i < DimId; ++i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - m_inputStride *= input_dims[DimId]; - m_inputOffset = m_stride * op.offset(); + m_stride = 1; + m_inputStride = 1; + if (Layout == ColMajor) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -150,16 +191,52 @@ struct TensorEvaluator, Device> return m_impl.coeff(srcCoeff(index)); } - /* to be done template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - }*/ + if ((Layout == ColMajor && m_dim.actualDim() == 0) || + (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims - 1) || + (Layout == RowMajor && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. 
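// chip() now comes in two flavors: the template form keeps the chipped
// dimension as a compile-time constant (DimensionId<DimId>), while the new
// runtime form routes through the Dynamic specialization introduced above.
// Sketch (illustrative only):
#include <unsupported/Eigen/CXX11/Tensor>

void chip_sketch() {
  Eigen::Tensor<float, 3> t(4, 5, 6);
  t.setRandom();
  Eigen::Tensor<float, 2> plane_a = t.chip<1>(2);  // dim fixed at compile time, 4x6 slice
  Eigen::Tensor<float, 2> plane_b = t.chip(2, 1);  // same slice, dim chosen at run time
}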
+ eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); - if (DimId == NumDims && result) { + if (m_dim.actualDim() == NumDims && result) { return result + m_inputOffset; } else { return NULL; @@ -170,11 +247,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if (DimId == 0) { + if ((Layout == ColMajor && m_dim.actualDim() == 0) || + (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if (DimId == NumInputDims-1) { + } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || + (Layout == RowMajor && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -192,12 +271,13 @@ struct TensorEvaluator, Device> Index m_inputOffset; Index m_inputStride; TensorEvaluator m_impl; + const internal::DimensionId m_dim; const Device& m_device; }; // Eval as lvalue -template +template struct TensorEvaluator, Device> : public TensorEvaluator, Device> { @@ -207,17 +287,17 @@ struct TensorEvaluator, Device> static const int NumDims = NumInputDims-1; typedef typename XprType::Index Index; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { IsAligned = false, - PacketAccess = false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -226,11 +306,45 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /* to be done template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - } */ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || + (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. 
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || + (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 74485b15b..fb4e7fb11 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -35,6 +35,8 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; enum { Flags = 0 }; }; @@ -103,11 +105,13 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -127,13 +131,26 @@ struct TensorEvaluator= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } } } @@ -159,25 +176,49 @@ struct TensorEvaluator subs; - for (int i = NumDims - 1; i > 0; --i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; } - subs[0] = index; const Dimensions& left_dims = m_leftImpl.dimensions(); if (subs[m_axis] < left_dims[m_axis]) { - Index left_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + Index left_index; + if (Layout == ColMajor) { + left_index = 
subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } } return m_leftImpl.coeff(left_index); } else { subs[m_axis] -= left_dims[m_axis]; const Dimensions& right_dims = m_rightImpl.dimensions(); - Index right_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + Index right_index; + if (Layout == ColMajor) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } } return m_rightImpl.coeff(right_index); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 5851e5adc..e358e6a3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,10 +93,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 50cb10a33..aecef3313 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -144,9 +144,9 @@ template struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename internal::promote_storage_type::ret Scalar; - typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::ret Scalar; + typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -155,6 +155,8 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; enum { Flags = 0, @@ -227,11 +229,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
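// A usage sketch for the convolution evaluator this hunk touches
// (illustrative only); as the static assertion below spells out, only
// column-major inputs are accepted for now.
#include <unsupported/Eigen/CXX11/Tensor>

void convolve_sketch() {
  Eigen::Tensor<float, 2> input(8, 8), kernel(3, 3);
  input.setRandom();
  kernel.setRandom();
  Eigen::array<int, 2> dims;
  dims[0] = 0;
  dims[1] = 1;                                                 // convolve along both dims
  Eigen::Tensor<float, 2> out = input.convolve(kernel, dims);  // 6x6 "valid" result
}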
+ EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -389,10 +397,6 @@ struct TensorEvaluator m_inputStride; array m_outputStride; @@ -421,7 +425,7 @@ struct GetKernelSize { } }; template <> -struct GetKernelSize { +struct GetKernelSize { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { return kernelSize; } @@ -610,11 +614,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -740,19 +750,17 @@ struct TensorEvaluator indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); switch(kernel_size) { case 4: { - EigenConvolutionKernel1D, Index, InputDims, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - EigenConvolutionKernel1D, Index, InputDims, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - EigenConvolutionKernel1D, Index, InputDims, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); break; } @@ -797,11 +805,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); break; } default: { - EigenConvolutionKernel2D, Index, InputDims, 4, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -810,23 +818,21 @@ struct TensorEvaluator, Index, InputDims, 7, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, 
data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - EigenConvolutionKernel2D, Index, InputDims, 7, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - EigenConvolutionKernel2D, Index, InputDims, Eigen::Dynamic, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); break; } @@ -858,9 +864,7 @@ struct TensorEvaluator kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - EigenConvolutionKernel3D, Index, InputDims> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ce9d73578..93ebbe277 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -25,11 +25,14 @@ struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
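// Backing up to the CUDA hunks above: the raw <<<...>>> launches, each
// followed by a cudaGetLastError() assert, are now funneled through a single
// LAUNCH_CUDA_KERNEL macro. Its definition is outside this excerpt; a
// plausible sketch, assuming GpuDevice exposes the stream to launch on:
//
//   #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
//     (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
//     assert(cudaGetLastError() == cudaSuccess)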
typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -60,24 +63,24 @@ class TensorEvalToOp : public TensorBase > typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } protected: typename XprType::Nested m_xpr; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; @@ -93,6 +96,8 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -103,12 +108,12 @@ struct TensorEvaluator, Device> } typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -117,7 +122,7 @@ struct TensorEvaluator, Device> m_buffer[i] = m_impl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -135,12 +140,12 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_impl; const Device& m_device; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index cb14cc7f7..a9501336e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -25,11 +25,14 @@ struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; typedef typename traits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -59,8 +62,8 @@ class TensorForcedEvalOp : public TensorBase > typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -88,6 +91,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -100,10 +104,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); - m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); - + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
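// allocate() hands back raw, uninitialized storage, so for non-POD scalar
// types each element is default-constructed with placement new in the loop
// that follows. The general pattern, in isolation:
//
//   T* buf = static_cast<T*>(device.allocate(n * sizeof(T)));
//   for (Index i = 0; i < n; ++i) new (buf + i) T();  // construct in place
//
// (A fully symmetric treatment would also run ~T() before deallocation; only
// the construction side appears in this hunk.)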
+ if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); @@ -132,7 +142,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; const ArgType m_op; const Device& m_device; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 85599ccfd..7bec2b10a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -29,9 +29,11 @@ template cla template class TensorPatchOp; template class TensorImagePatchOp; template class TensorBroadcastingOp; -template class TensorChippingOp; +template class TensorChippingOp; template class TensorReshapingOp; +template class TensorLayoutSwapOp; template class TensorSlicingOp; +template class TensorReverseOp; template class TensorPaddingOp; template class TensorShufflingOp; template class TensorStridingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0dfb6913b..585ebc778 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -37,6 +37,8 @@ struct traits > : public traits typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; }; template @@ -53,8 +55,6 @@ struct nested, 1, typename eval class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> { @@ -69,9 +69,11 @@ class TensorImagePatchOp : public TensorBase::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides) + DenseIndex row_strides, DenseIndex col_strides, + PaddingType padding_type) : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides){} + m_row_strides(row_strides), m_col_strides(col_strides), + m_padding_type(padding_type) {} EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } @@ -81,6 +83,8 @@ class TensorImagePatchOp : public TensorBase::type& @@ -92,6 +96,7 @@ class TensorImagePatchOp : public TensorBase, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = NumDims == 5, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // We only support same strides for both dimensions and square patches. 
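// The switch below sizes the patch grid the usual way: PADDING_VALID keeps
// only fully covered positions, PADDING_SAME keeps input/stride positions
// and pads the border. The same arithmetic as a standalone helper
// (hypothetical name):
#include <cmath>

long patch_output_size(long input, long patch, long stride, bool padding_valid) {
  if (padding_valid) {
    return static_cast<long>(std::ceil((input - patch + 1) / static_cast<float>(stride)));
  }
  return static_cast<long>(std::ceil(input / static_cast<float>(stride)));  // PADDING_SAME
}
// e.g. input = 28, patch = 5, stride = 1: VALID gives 24, SAME gives 28; the
// implied top/left padding is ((output - 1) * stride + patch - input) / 2.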
+ eigen_assert(m_row_strides == m_col_strides); + + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast(m_row_strides)); + m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_inputRows / static_cast(m_row_strides)); + m_outputCols = ceil(m_inputCols / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Dimensions for result of extraction. + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). m_dimensions[0] = input_dims[0]; m_dimensions[1] = op.patch_rows(); m_dimensions[2] = op.patch_cols(); - m_dimensions[3] = ceilf(static_cast(input_dims[1]) / op.row_strides()) * - ceilf(static_cast(input_dims[2]) / op.col_strides()); + m_dimensions[3] = m_outputRows * m_outputCols; for (int i = 4; i < NumDims; ++i) { m_dimensions[i] = input_dims[i-1]; } + // Strides for moving the patch in various dimensions. m_colStride = m_dimensions[1]; m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; m_otherStride = m_patchStride * m_dimensions[3]; - m_inputRows = input_dims[1]; - m_inputCols = input_dims[2]; - - m_rowInputStride = input_dims[0] * op.row_strides(); - m_colInputStride = input_dims[0] * input_dims[1] * op.col_strides(); + // Strides for navigating through the input tensor. + m_rowInputStride = input_dims[0]; + m_colInputStride = input_dims[0] * input_dims[1]; m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2]; - m_rowPaddingTop = op.patch_rows() / 2; - m_colPaddingLeft = op.patch_cols() / 2; - + // Fast representations of different variables. m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); m_fastColStride = internal::TensorIntDivisor(m_colStride); - m_fastInputRows = internal::TensorIntDivisor(m_inputRows); + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); m_fastDimZero = internal::TensorIntDivisor(m_dimensions[0]); } @@ -162,26 +205,29 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - // Find the location of the first element of the patch. + // Patch index corresponding to the passed in index. const Index patchIndex = index / m_fastPatchStride; // Find the offset of the element wrt the location of the first element. const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + // Other ways to index this element. const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colIndex = patch2DIndex / m_fastOutputRows; const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex + colOffset - m_colPaddingLeft; + // Calculate the col index in the original input tensor. + const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft; if (inputCol < 0 || inputCol >= m_inputCols) { return Scalar(0); } - const Index rowIndex = patch2DIndex - colIndex * m_inputRows; // m_rowStride is always 1 + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; const Index rowOffset = patchOffset - colOffset * m_colStride; - const Index inputRow = rowIndex + rowOffset - m_rowPaddingTop; + // Calculate the row index in the original input tensor. + const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop; if (inputRow < 0 || inputRow >= m_inputRows) { return Scalar(0); } @@ -214,20 +260,24 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colIndex = patch2DIndex / m_fastOutputRows; const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - const Index inputCols[2] = {colIndex + colOffsets[0] - m_colPaddingLeft, colIndex + colOffsets[1] - m_colPaddingLeft}; + // Calculate the col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { // all zeros return internal::pset1<PacketReturnType>(Scalar(0)); } if (inputCols[0] == inputCols[1]) { - const Index rowIndex = patch2DIndex - colIndex * m_inputRows; + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; eigen_assert(rowOffsets[0] <= rowOffsets[1]); - const Index inputRows[2] = {rowIndex + rowOffsets[0] - m_rowPaddingTop, rowIndex + rowOffsets[1] - m_rowPaddingTop}; + // Calculate the row indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { // all zeros @@ -247,6 +297,43 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Scalar* data() const { return NULL; } + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch.
+ // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + const Index patchIndex = coords[3]; + + array inputCoords; + inputCoords[0] = coords[0]; // depth + inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop; + inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft; + inputCoords[3] = coords[4]; // batch + // If the computed coordinates are outside the original image perimeter, return 0. + if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows || + inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) { + return Scalar(0); + } + if (TensorEvaluator::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + return m_impl.coeff(inputIndex); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { @@ -264,6 +351,8 @@ struct TensorEvaluator, Device> Index m_otherStride; Index m_patchStride; Index m_colStride; + Index m_row_strides; + Index m_col_strides; internal::TensorIntDivisor m_fastOtherStride; internal::TensorIntDivisor m_fastPatchStride; internal::TensorIntDivisor m_fastColStride; @@ -275,10 +364,13 @@ struct TensorEvaluator, Device> Index m_inputRows; Index m_inputCols; + Index m_outputRows; + Index m_outputCols; + Index m_rowPaddingTop; Index m_colPaddingLeft; - internal::TensorIntDivisor m_fastInputRows; + internal::TensorIntDivisor m_fastOutputRows; internal::TensorIntDivisor m_fastDimZero; TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 33849ed3e..23b595ac3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; }; template @@ -54,8 +57,8 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -96,11 +99,17 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), 
m_dimensions(op.dimensions()) - { } + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -109,7 +118,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -127,7 +136,9 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - Scalar* data() const { return m_impl.data(); } + CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } protected: TensorEvaluator m_impl; @@ -148,6 +159,8 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -183,11 +196,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; }; template @@ -260,6 +276,8 @@ struct TensorEvaluator, Devi // slice offsets and sizes. 
IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -270,22 +288,30 @@ struct TensorEvaluator, Devi } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { + const Sizes& output_dims = op.sizes(); + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } else { - m_inputStrides[0] = 1; } - } - const Sizes& output_dims = op.sizes(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { + m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } else { - m_outputStrides[0] = 1; - m_fastOutputStrides[0] = 1; + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + m_fastOutputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -299,14 +325,23 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); if (internal::is_arithmetic::value && data && m_impl.data()) { Index contiguous_values = 1; - for (int i = 0; i < NumDims; ++i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; + if (Layout == ColMajor) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } } } // Use memcpy if it's going to be faster than using the regular evaluation. 
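The layout dispatch introduced above recurs throughout this patch: every evaluator now computes its input and output strides in one of two orders, with the first dimension innermost for ColMajor and the last dimension innermost for RowMajor. A compact sketch of the shared pattern (illustration only, not Eigen code):

#include <array>
#include <cstddef>

// Flat-index strides for a tensor with the given dimensions. Column-major
// means the first index varies fastest; row-major means the last one does.
template <std::size_t NumDims>
std::array<long, NumDims> computeStrides(const std::array<long, NumDims>& dims,
                                         bool colMajor) {
  std::array<long, NumDims> strides;
  if (colMajor) {
    strides[0] = 1;
    for (std::size_t i = 1; i < NumDims; ++i)
      strides[i] = strides[i - 1] * dims[i - 1];
  } else {
    strides[NumDims - 1] = 1;
    for (std::size_t i = NumDims - 1; i > 0; --i)
      strides[i - 1] = strides[i] * dims[i];
  }
  return strides;
}

// Example: a 2x3x5 tensor has strides {1, 2, 6} in column-major order and
// {15, 5, 1} in row-major order, which is why the loops in the evaluators
// run forward in one layout and backward in the other.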
@@ -340,16 +375,29 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; @@ -366,20 +414,44 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) + { + array inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { Scalar* result = m_impl.data(); if (result) { Index offset = 0; - for (int i = 0; i < NumDims; ++i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i+1; j < NumDims; ++j) { - if (m_dimensions[j] > 1) { - return NULL; + if (Layout == ColMajor) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; } - offset += m_offsets[j] * m_inputStrides[j]; + break; } - break; } } return result + offset; @@ -391,12 +463,21 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 
0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); } - inputIndex += (index + m_offsets[0]); return inputIndex; } @@ -422,6 +503,8 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -445,16 +528,29 @@ struct TensorEvaluator, Device> const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); } - inputIndices[0] += (indices[0] + this->m_offsets[0]); - inputIndices[1] += (indices[1] + this->m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { this->m_impl.template writePacket(inputIndices[0], x); } @@ -468,6 +564,15 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) + { + array inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index d6347b054..9b14e01f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type 
Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -88,6 +91,8 @@ struct TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = true, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -99,13 +104,23 @@ struct TensorEvaluator, Device m_dimensions[i] += m_padding[i].first + m_padding[i].second; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -126,23 +141,84 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { return Scalar(0); } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[NumDims-1].first); } - inputIndex += (index - m_padding[0].first); return m_impl.coeff(inputIndex); } template EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (Layout == ColMajor) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index inputIndex; + if (Layout == ColMajor) { + const Index idx = coords[0]; + if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[0].first; + for (int i = 1; i < NumDims; ++i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } else { + const Index idx = coords[NumDims-1]; + if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[NumDims-1].first; + for (int i = NumDims - 2; i >= 0; --i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } + return m_impl.coeff(inputIndex); + } + + Scalar* data() const { return NULL; } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { const int packetSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -200,9 +276,64 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return packetWithPossibleZero(initialIndex); } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - protected: + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the two padding zones. + const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the two padding zones. + inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index e2fe32d67..1c03d202f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -24,11 +24,14 @@ template<typename PatchDim, typename XprType> struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits<Scalar>::type Packet; - typedef typename traits<XprType>::StorageKind StorageKind; - typedef typename traits<XprType>::Index Index; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; }; template @@ -89,11 +92,16 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - }; + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + Index num_patches = 1; const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const PatchDim& patch_dims = op.patch_dims(); @@ -195,6 +203,35 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch.
+ Index patchIndex = coords[NumDims - 1]; + + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + array<Index, NumDims-1> inputCoords; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputCoords[i] = offsetIdx + patchIdx; + } + inputCoords[0] = (patchIndex + coords[0]); + return m_impl.coeff(inputCoords); + } + else { + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + coords[0]); + return m_impl.coeff(inputIndex); + } + } + Scalar* data() const { return NULL; } protected: @@ -206,7 +243,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> TensorEvaluator<ArgType, Device> m_impl; }; - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 831a9f005..ab5fc6a69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -24,11 +24,14 @@ template<typename Shuffle, typename XprType> struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits<Scalar>::type Packet; - typedef typename traits<XprType>::StorageKind StorageKind; - typedef typename traits<XprType>::Index Index; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -99,6 +102,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -112,15 +117,22 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> array<Index, NumDims> inputStrides; - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - inputStrides[0] = 1; - m_outputStrides[0] = 1; + if (Layout == ColMajor) { + inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; } } + for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = inputStrides[shuffle[i]]; } @@ -162,15 +174,23 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Scalar* data() const { return NULL; } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; + if
(Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; } - return inputIndex + index * m_inputStrides[0]; } Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index ecfdb762c..2fbdfadfe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -98,6 +101,8 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,14 +114,25 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_inputStrides[i-1] *= op.strides()[i-1]; + if (Layout == ColMajor) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; } - m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -135,14 +151,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[0]; - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } template @@ -154,16 +163,29 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = 
{index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; - inputIndices[0] += idx0 * m_inputStrides[i]; - inputIndices[1] += idx1 * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; } - inputIndices[0] += indices[0] * m_inputStrides[0]; - inputIndices[1] += indices[1] * m_inputStrides[0]; if (inputIndices[1] - inputIndices[0] == packetSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; @@ -183,6 +205,27 @@ struct TensorEvaluator, Device> Scalar* data() const { return NULL; } protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; @@ -190,6 +233,84 @@ struct TensorEvaluator, Device> }; +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 5c0f78489..022d20360 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -50,6 +50,8 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, Flags = compute_tensor_flags::ret | LvalueBit, @@ -63,6 +65,8 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + static const int NumDimensions = array_size::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, Flags = compute_tensor_flags::ret | LvalueBit, @@ -78,6 +82,8 @@ struct traits > typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; enum { Options = Options_, Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), @@ -92,6 +98,8 @@ struct traits > typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), @@ -198,6 +206,51 @@ struct nested, 1, typename eval Date: Wed, 14 Jan 2015 15:43:38 -0800 Subject: Updated the list of include files --- unsupported/Eigen/CXX11/Tensor | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index aa26e5283..34107ae71 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -30,13 +30,20 @@ #include #include +#if __cplusplus > 199711 +#include +#endif + #ifdef EIGEN_USE_THREADS #include #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#ifdef EIGEN_USE_GPU +#include +#if defined(__CUDACC__) #include #endif +#endif #include "Eigen/Core" @@ -44,6 +51,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" @@ -55,15 +63,17 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" @@ -77,7 +87,6 @@ #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" - #include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" -- cgit v1.2.3 From b5124e7cfda27ed99dcfcec8cb1b674efa1ef4a3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:46:04 -0800 Subject: Created many additional tests --- unsupported/test/CMakeLists.txt | 13 +- unsupported/test/cxx11_tensor_assign.cpp | 73 ++++ unsupported/test/cxx11_tensor_broadcasting.cpp | 86 ++++- unsupported/test/cxx11_tensor_chipping.cpp | 183 ++++++--- unsupported/test/cxx11_tensor_concatenation.cpp | 34 +- unsupported/test/cxx11_tensor_contract_cuda.cpp | 121 ++++++ unsupported/test/cxx11_tensor_contraction.cpp | 221 +++++++---- unsupported/test/cxx11_tensor_cuda.cpp | 474 ++++++++++++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 118 +++--- unsupported/test/cxx11_tensor_dimension.cpp | 9 +- unsupported/test/cxx11_tensor_expr.cpp | 40 ++ unsupported/test/cxx11_tensor_forced_eval.cpp | 27 ++ 
unsupported/test/cxx11_tensor_image_patch.cpp | 206 +++++++++- unsupported/test/cxx11_tensor_map.cpp | 7 +- unsupported/test/cxx11_tensor_morphing.cpp | 143 +++++-- unsupported/test/cxx11_tensor_of_strings.cpp | 54 +-- unsupported/test/cxx11_tensor_padding.cpp | 23 +- unsupported/test/cxx11_tensor_patch.cpp | 17 + unsupported/test/cxx11_tensor_reduction.cpp | 287 ++++++++++++-- unsupported/test/cxx11_tensor_shuffling.cpp | 28 +- unsupported/test/cxx11_tensor_simple.cpp | 3 + unsupported/test/cxx11_tensor_striding.cpp | 38 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 70 ++-- 23 files changed, 1908 insertions(+), 367 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_contract_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_cuda.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 89c651804..9f44e47f9 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -99,7 +99,7 @@ if(EIGEN_TEST_CXX11) # older compiler that don't support cxx11. ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") - ei_add_test(cxx11_tensor_symmetry "-std=c++0x") +# ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_index_list "-std=c++0x") @@ -126,8 +126,17 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") -# ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") ei_add_test(cxx11_tensor_ref "-std=c++0x") + ei_add_test(cxx11_tensor_random "-std=c++0x") + ei_add_test(cxx11_tensor_casts "-std=c++0x") + ei_add_test(cxx11_tensor_reverse "-std=c++0x") + ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") ei_add_test(cxx11_tensor_io "-std=c++0x") + + # These tests needs nvcc +# ei_add_test(cxx11_tensor_device "-std=c++0x") +# ei_add_test(cxx11_tensor_cuda "-std=c++0x") +# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") + endif() diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index 0ac3f9bf9..d16aaf847 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -285,6 +285,78 @@ static void test_compound_assign() } } +static void test_std_initializers_tensor() { +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + Tensor a(3); + a.setValues({0, 1, 2}); + VERIFY_IS_EQUAL(a(0), 0); + VERIFY_IS_EQUAL(a(1), 1); + VERIFY_IS_EQUAL(a(2), 2); + + // It fills the top-left slice. + a.setValues({10, 20}); + VERIFY_IS_EQUAL(a(0), 10); + VERIFY_IS_EQUAL(a(1), 20); + VERIFY_IS_EQUAL(a(2), 2); + + // Chaining. + Tensor a2(3); + a2 = a.setValues({100, 200, 300}); + VERIFY_IS_EQUAL(a(0), 100); + VERIFY_IS_EQUAL(a(1), 200); + VERIFY_IS_EQUAL(a(2), 300); + VERIFY_IS_EQUAL(a2(0), 100); + VERIFY_IS_EQUAL(a2(1), 200); + VERIFY_IS_EQUAL(a2(2), 300); + + Tensor b(2, 3); + b.setValues({{0, 1, 2}, {3, 4, 5}}); + VERIFY_IS_EQUAL(b(0, 0), 0); + VERIFY_IS_EQUAL(b(0, 1), 1); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 3); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + // It fills the top-left slice. 
+ b.setValues({{10, 20}, {30}}); + VERIFY_IS_EQUAL(b(0, 0), 10); + VERIFY_IS_EQUAL(b(0, 1), 20); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 30); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + Eigen::Tensor c(3, 2, 4); + c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}}, + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{20, 21, 22, 23}, {24, 25, 26, 27}}}); + VERIFY_IS_EQUAL(c(0, 0, 0), 0); + VERIFY_IS_EQUAL(c(0, 0, 1), 1); + VERIFY_IS_EQUAL(c(0, 0, 2), 2); + VERIFY_IS_EQUAL(c(0, 0, 3), 3); + VERIFY_IS_EQUAL(c(0, 1, 0), 4); + VERIFY_IS_EQUAL(c(0, 1, 1), 5); + VERIFY_IS_EQUAL(c(0, 1, 2), 6); + VERIFY_IS_EQUAL(c(0, 1, 3), 7); + VERIFY_IS_EQUAL(c(1, 0, 0), 10); + VERIFY_IS_EQUAL(c(1, 0, 1), 11); + VERIFY_IS_EQUAL(c(1, 0, 2), 12); + VERIFY_IS_EQUAL(c(1, 0, 3), 13); + VERIFY_IS_EQUAL(c(1, 1, 0), 14); + VERIFY_IS_EQUAL(c(1, 1, 1), 15); + VERIFY_IS_EQUAL(c(1, 1, 2), 16); + VERIFY_IS_EQUAL(c(1, 1, 3), 17); + VERIFY_IS_EQUAL(c(2, 0, 0), 20); + VERIFY_IS_EQUAL(c(2, 0, 1), 21); + VERIFY_IS_EQUAL(c(2, 0, 2), 22); + VERIFY_IS_EQUAL(c(2, 0, 3), 23); + VERIFY_IS_EQUAL(c(2, 1, 0), 24); + VERIFY_IS_EQUAL(c(2, 1, 1), 25); + VERIFY_IS_EQUAL(c(2, 1, 2), 26); + VERIFY_IS_EQUAL(c(2, 1, 3), 27); +#endif // EIGEN_HAS_VARIADIC_TEMPLATES +} void test_cxx11_tensor_assign() { @@ -294,4 +366,5 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_same_type()); CALL_SUBTEST(test_auto_resize()); CALL_SUBTEST(test_compound_assign()); + CALL_SUBTEST(test_std_initializers_tensor()); } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 9663912a4..f0792bdcf 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_broadcasting() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array broadcasts; broadcasts[0] = 1; @@ -23,7 +24,7 @@ static void test_simple_broadcasting() broadcasts[2] = 1; broadcasts[3] = 1; - Tensor no_broadcast; + Tensor no_broadcast; no_broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); @@ -45,7 +46,7 @@ static void test_simple_broadcasting() broadcasts[1] = 3; broadcasts[2] = 1; broadcasts[3] = 4; - Tensor broadcast; + Tensor broadcast; broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(broadcast.dimension(0), 4); @@ -65,16 +66,17 @@ static void test_simple_broadcasting() } +template static void test_vectorized_broadcasting() { - Tensor tensor(8,3,5); + Tensor tensor(8,3,5); tensor.setRandom(); array broadcasts; broadcasts[0] = 2; broadcasts[1] = 3; broadcasts[2] = 4; - Tensor broadcast; + Tensor broadcast; broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(broadcast.dimension(0), 16); @@ -107,8 +109,78 @@ static void test_vectorized_broadcasting() } +template +static void test_static_broadcasting() +{ + Tensor tensor(8,3,5); + tensor.setRandom(); + Eigen::IndexList, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; + + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + 
VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template +static void test_fixed_size_broadcasting() +{ + // Need to add a [] operator to the Size class for this to work +#if 0 + Tensor t1(10); + t1.setRandom(); + TensorFixedSize, DataLayout> t2; + t2 = t2.constant(20.0f); + + Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } + + TensorMap, DataLayout> > t4(t2.data(), {{1}}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); + } +#endif +} + + void test_cxx11_tensor_broadcasting() { - CALL_SUBTEST(test_simple_broadcasting()); - CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_static_broadcasting()); + CALL_SUBTEST(test_static_broadcasting()); + CALL_SUBTEST(test_fixed_size_broadcasting()); + CALL_SUBTEST(test_fixed_size_broadcasting()); } diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 0027b2888..0de7bbac6 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -13,18 +13,20 @@ using Eigen::Tensor; - +template static void test_simple_chip() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); - Tensor chip1; - chip1 = tensor.chip<0>(1); + Tensor chip1; + chip1 = tensor.template chip<0>(1); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); VERIFY_IS_EQUAL(chip1.dimension(1), 5); VERIFY_IS_EQUAL(chip1.dimension(2), 7); VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { @@ -35,7 +37,7 @@ static void test_simple_chip() } } - Tensor chip2 = tensor.chip<1>(1); + Tensor chip2 = tensor.template chip<1>(1); VERIFY_IS_EQUAL(chip2.dimension(0), 2); VERIFY_IS_EQUAL(chip2.dimension(1), 5); VERIFY_IS_EQUAL(chip2.dimension(2), 7); @@ -50,7 +52,7 @@ static void test_simple_chip() } } - Tensor chip3 = tensor.chip<2>(2); + Tensor chip3 = tensor.template chip<2>(2); VERIFY_IS_EQUAL(chip3.dimension(0), 2); VERIFY_IS_EQUAL(chip3.dimension(1), 3); VERIFY_IS_EQUAL(chip3.dimension(2), 7); @@ -65,7 +67,7 @@ static void test_simple_chip() } } - Tensor chip4(tensor.chip<3>(5)); + Tensor chip4(tensor.template chip<3>(5)); VERIFY_IS_EQUAL(chip4.dimension(0), 2); VERIFY_IS_EQUAL(chip4.dimension(1), 3); VERIFY_IS_EQUAL(chip4.dimension(2), 5); @@ -80,7 +82,7 @@ static void test_simple_chip() } } - Tensor chip5(tensor.chip<4>(7)); + Tensor chip5(tensor.template chip<4>(7)); VERIFY_IS_EQUAL(chip5.dimension(0), 2); VERIFY_IS_EQUAL(chip5.dimension(1), 3); VERIFY_IS_EQUAL(chip5.dimension(2), 5); @@ -96,14 +98,97 @@ static void test_simple_chip() } } +template +static void test_dynamic_chip() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor chip1; + chip1 = tensor.chip(1, 0); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 
0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
 static void test_chip_in_expr() {
-  Tensor<float, 5> input1(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
   input1.setRandom();
-  Tensor<float, 4> input2(3,5,7,11);
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
   input2.setRandom();
 
-  Tensor<float, 4> result = input1.chip<0>(0) + input2;
+  Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -115,9 +200,9 @@ static void test_chip_in_expr() {
     }
   }
 
-  Tensor<float, 3> input3(3,7,11);
+  Tensor<float, 3, DataLayout> input3(3,7,11);
   input3.setRandom();
-  Tensor<float, 3> result2 = input1.chip<0>(0).chip<1>(2) + input3;
+  Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
       for (int k = 0; k < 11; ++k) {
@@ -128,16 +213,16 @@ static void test_chip_in_expr() {
     }
   }
 }
 
-
+template <int DataLayout>
 static void test_chip_as_lvalue()
 {
-  Tensor<float, 5> input1(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
   input1.setRandom();
 
-  Tensor<float, 4> input2(3,5,7,11);
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
   input2.setRandom();
-  Tensor<float, 5> tensor = input1;
-  tensor.chip<0>(1) = input2;
+  Tensor<float, 5, DataLayout> tensor = input1;
+  tensor.template chip<0>(1) = input2;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -154,10 +239,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input3(2,5,7,11);
+  Tensor<float, 4, DataLayout> input3(2,5,7,11);
   input3.setRandom();
   tensor = input1;
-  tensor.chip<1>(1) = input3;
+  tensor.template chip<1>(1) = input3;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -174,10 +259,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input4(2,3,7,11);
+  Tensor<float, 4, DataLayout> input4(2,3,7,11);
   input4.setRandom();
   tensor = input1;
-  tensor.chip<2>(3) = input4;
+  tensor.template chip<2>(3) = input4;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -194,10 +279,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input5(2,3,5,11);
+  Tensor<float, 4, DataLayout> input5(2,3,5,11);
   input5.setRandom();
   tensor = input1;
-  tensor.chip<3>(4) = input5;
+  tensor.template chip<3>(4) = input5;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -214,10 +299,10 @@ static void test_chip_as_lvalue()
     }
   }
 
-  Tensor<float, 4> input6(2,3,5,7);
+  Tensor<float, 4, DataLayout> input6(2,3,5,7);
   input6.setRandom();
   tensor = input1;
-  tensor.chip<4>(5) = input6;
+  tensor.template chip<4>(5) = input6;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
@@ -235,47 +320,57 @@ static void test_chip_as_lvalue()
     }
   }
 }
 
-
+template <int DataLayout>
 static void test_chip_raw_data()
 {
-  Tensor<float, 5> tensor(2,3,5,7,11);
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
-  auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4;
+  auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice());
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
         for (int l = 0; l < 7; ++l) {
-          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          int chip_index;
+          if (DataLayout == ColMajor) {
+            chip_index = i + 2 * (j + 3 * (k + 5 * l));
+          } else {
+            chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i)));
+          }
           VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
         }
       }
     }
   }
 
-  typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
-  auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0;
+  auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
-  auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
-  auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
 
-  typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
-  auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
+  typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
   VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
 }
 
-
 void test_cxx11_tensor_chipping()
 {
-  CALL_SUBTEST(test_simple_chip());
-  CALL_SUBTEST(test_chip_in_expr());
-  CALL_SUBTEST(test_chip_as_lvalue());
-  CALL_SUBTEST(test_chip_raw_data());
+  CALL_SUBTEST(test_simple_chip<ColMajor>());
+  CALL_SUBTEST(test_simple_chip<RowMajor>());
+  CALL_SUBTEST(test_dynamic_chip<ColMajor>());
+  CALL_SUBTEST(test_dynamic_chip<RowMajor>());
+  CALL_SUBTEST(test_chip_in_expr<ColMajor>());
+  CALL_SUBTEST(test_chip_in_expr<RowMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data<ColMajor>());
+  CALL_SUBTEST(test_chip_raw_data<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 8fd4f5f80..9fdf33c16 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -13,15 +13,16 @@ using Eigen::Tensor;
 
+template <int DataLayout>
 static void test_dimension_failures()
 {
-  Tensor<int, 3> left(2, 3, 1);
-  Tensor<int, 3> right(3, 3, 1);
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
   left.setRandom();
   right.setRandom();
 
   // Okay; other dimensions are equal.
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
 
   // Dimension mismatches.
   VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
@@ -32,33 +33,35 @@ static void test_dimension_failures()
   VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
 }
 
+template <int DataLayout>
 static void test_static_dimension_failure()
 {
-  Tensor<int, 2> left(2, 3);
-  Tensor<int, 3> right(2, 3, 1);
+  Tensor<int, 2, DataLayout> left(2, 3);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
 
 #ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
   // Technically compatible, but we static assert that the inputs have same
   // NumDims.
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
 #endif
 
   // This can be worked around in this case.
-  Tensor<int, 3> concatenation = left
+  Tensor<int, 3, DataLayout> concatenation = left
       .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}})
       .concatenate(right, 0);
-  Tensor<int, 2> alternative = left
+  Tensor<int, 2, DataLayout> alternative = left
       .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
 }
 
+template <int DataLayout>
 static void test_simple_concatenation()
 {
-  Tensor<int, 3> left(2, 3, 1);
-  Tensor<int, 3> right(2, 3, 1);
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(2, 3, 1);
   left.setRandom();
   right.setRandom();
 
-  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
   VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
   VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
   VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
@@ -103,8 +106,11 @@ static void test_simple_concatenation()
 
 void test_cxx11_tensor_concatenation()
 {
-   CALL_SUBTEST(test_dimension_failures());
-   CALL_SUBTEST(test_static_dimension_failure());
-   CALL_SUBTEST(test_simple_concatenation());
+   CALL_SUBTEST(test_dimension_failures<ColMajor>());
+   CALL_SUBTEST(test_dimension_failures<RowMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
+   CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
+   CALL_SUBTEST(test_simple_concatenation<ColMajor>());
+   CALL_SUBTEST(test_simple_concatenation<RowMajor>());
   // CALL_SUBTEST(test_vectorized_concatenation());
 }
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp
new file mode 100644
index 000000000..9599607c6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp
@@ -0,0 +1,121 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2014 Navdeep Jaitly
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
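+
+// Compares full tensor contractions evaluated on the GPU against the same
+// contractions evaluated on the host, over a range of matrix sizes.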
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <int DataLayout>
+static void test_cuda_contraction(int m_size, int k_size, int n_size)
+{
+  cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << endl;
+  Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
+  Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
+  Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
+  Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
+      cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
+           << " vs " << t_result_gpu.data()[i] << endl;
+      assert(false);
+    }
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_cuda()
+{
+  cout << "Calling contraction tests" << endl;
+  CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+  }
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+  }
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+  }
+
+  int m_sizes[] = {31,  39,  63,  64,  65,
+                   127, 129, 255, 257, 511,
+                   512, 513, 1023, 1024, 1025 };
+  int n_sizes[] = {31,  39,  63,  64,  65,
+                   127, 129, 255, 257, 511,
+                   512, 513, 1023, 1024, 1025 };
+
+  int k_sizes[] = { 31,  39,  63,  64,  65,
+                    95,  96, 127, 129, 255,
+                   257, 511, 512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++)
+    for (int j = 0; j < 15; j++)
+      for (int k = 0; k < 17; k++) {
+        CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+        CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+      }
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 17bd335f7..6124818fd 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -16,18 +16,18 @@
 using Eigen::Tensor;
 
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
-
+template <int DataLayout>
 static void test_evals()
 {
-  Tensor<float, 2> mat1(2, 3);
-  Tensor<float, 2> mat2(2, 3);
-  Tensor<float, 2> mat3(3, 2);
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(2, 3);
+  Tensor<float, 2, DataLayout> mat3(3, 2);
 
   mat1.setRandom();
   mat2.setRandom();
   mat3.setRandom();
 
-  Tensor<float, 2> mat4(3,3);
+  Tensor<float, 2, DataLayout> mat4(3,3);
   mat4.setZero();
   Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
@@ -47,7 +47,7 @@ static void test_evals()
   VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
   VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
 
-  Tensor<float, 2> mat5(2,2);
+  Tensor<float, 2, DataLayout> mat5(2,2);
   mat5.setZero();
   Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
@@ -62,7 +62,7 @@ static void test_evals()
   VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
   VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
 
-  Tensor<float, 2> mat6(2,2);
+  Tensor<float, 2, DataLayout> mat6(2,2);
   mat6.setZero();
   Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
@@ -78,16 +78,16 @@ static void test_evals()
   VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
 }
 
-
+template <int DataLayout>
 static void test_scalar()
 {
-  Tensor<float, 1> vec1({6});
-  Tensor<float, 1> vec2({6});
+  Tensor<float, 1, DataLayout> vec1({6});
+  Tensor<float, 1, DataLayout> vec2({6});
 
   vec1.setRandom();
   vec2.setRandom();
 
-  Tensor<float, 1> scalar(1);
+  Tensor<float, 1, DataLayout> scalar(1);
   scalar.setZero();
   Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}});
   typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator;
@@ -102,16 +102,16 @@ static void test_scalar()
   VERIFY_IS_APPROX(scalar(0), expected);
 }
 
-
+template <int DataLayout>
 static void test_multidims()
 {
-  Tensor<float, 3> mat1(2, 2, 2);
-  Tensor<float, 4> mat2(2, 2, 2, 2);
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
 
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 3> mat3(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat3(2, 2, 2);
   mat3.setZero();
   Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}});
   typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
@@ -140,15 +140,15 @@ static void test_multidims()
                                  mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
 }
 
-
+template <int DataLayout>
 static void test_holes() {
-  Tensor<float, 4> t1(2, 5, 7, 3);
-  Tensor<float, 5> t2(2, 7, 11, 13, 3);
+  Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
+  Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
   t1.setRandom();
   t2.setRandom();
 
   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}});
-  Tensor<float, 5> result = t1.contract(t2, dims);
+  Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
   VERIFY_IS_EQUAL(result.dimension(0), 5);
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   VERIFY_IS_EQUAL(result.dimension(2), 7);
@@ -174,16 +174,16 @@ static void test_holes() {
   }
 }
 
-
+template <int DataLayout>
 static void test_full_redux()
 {
-  Tensor<float, 2> t1(2, 2);
-  Tensor<float, 3> t2(2, 2, 2);
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 3, DataLayout> t2(2, 2, 2);
   t1.setRandom();
   t2.setRandom();
 
   Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
-  Tensor<float, 1> result = t1.contract(t2, dims);
+  Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0)
                             + t1(1, 0) * t2(1, 0, 0)
                             + t1(0, 1) * t2(0, 1, 0)
                             + t1(1, 1) * t2(1, 1, 0));
@@ -200,13 +200,13 @@ static void test_full_redux()
                             + t1(0, 1) * t2(1, 0, 1)
                             + t1(1, 1) * t2(1, 1, 1));
 }
 
-
+template <int DataLayout>
 static void test_contraction_of_contraction()
 {
-  Tensor<float, 2> t1(2, 2);
-  Tensor<float, 2> t2(2, 2);
-  Tensor<float, 2> t3(2, 2);
-  Tensor<float, 2> t4(2, 2);
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 2, DataLayout> t2(2, 2);
+  Tensor<float, 2, DataLayout> t3(2, 2);
+  Tensor<float, 2, DataLayout> t4(2, 2);
   t1.setRandom();
   t2.setRandom();
   t3.setRandom();
@@ -216,30 +216,32 @@ static void test_contraction_of_contraction()
   auto contract1 = t1.contract(t2, dims);
   auto diff = t3 - contract1;
   auto contract2 = t1.contract(t4, dims);
-  Tensor<float, 2> result = contract2.contract(diff, dims);
+  Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
+
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 2);
 
-  Eigen::Map<MatrixXf> m1(t1.data(), 2, 2);
-  Eigen::Map<MatrixXf> m2(t2.data(), 2, 2);
-  Eigen::Map<MatrixXf> m3(t3.data(), 2, 2);
-  Eigen::Map<MatrixXf> m4(t4.data(), 2, 2);
-  Eigen::MatrixXf expected = (m1 * m4) * (m3 - m1 * m2);
+  Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
+      m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
+      m4(t4.data(), 2, 2);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
+      expected = (m1 * m4) * (m3 - m1 * m2);
+
   VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
   VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
   VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
   VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
 }
 
-
+template <int DataLayout>
 static void test_expr()
 {
-  Tensor<float, 2> mat1(2, 3);
-  Tensor<float, 2> mat2(3, 2);
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(3, 2);
 
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 2> mat3(2,2);
+  Tensor<float, 2, DataLayout> mat3(2,2);
 
   Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
   mat3 = mat1.contract(mat2, dims);
@@ -250,16 +252,16 @@ static void test_expr()
   VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
 }
 
-
+template <int DataLayout>
 static void test_out_of_order_contraction()
 {
-  Tensor<float, 3> mat1(2, 2, 2);
-  Tensor<float, 3> mat2(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat2(2, 2, 2);
 
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 2> mat3(2, 2);
+  Tensor<float, 2, DataLayout> mat3(2, 2);
 
   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}});
   mat3 = mat1.contract(mat2, dims);
@@ -295,18 +297,18 @@ static void test_out_of_order_contraction()
 }
 
 
-
+template <int DataLayout>
 static void test_consistency()
 {
   // this does something like testing (A*B)^T = (B^T * A^T)
 
-  Tensor<float, 3> mat1(4, 3, 5);
-  Tensor<float, 5> mat2(3, 2, 1, 5, 4);
+  Tensor<float, 3, DataLayout> mat1(4, 3, 5);
+  Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 4> mat3(5, 2, 1, 5);
-  Tensor<float, 4> mat4(2, 1, 5, 5);
+  Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
+  Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
 
   // contract on dimensions of size 4 and 3
   Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}});
@@ -316,27 +318,40 @@ static void test_consistency()
   mat4 = mat2.contract(mat1, dims2);
 
   // check that these are equal except for ordering of dimensions
-  for (size_t i = 0; i < 5; i++) {
-    for (size_t j = 0; j < 10; j++) {
-      VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+  if (DataLayout == ColMajor) {
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+      }
+    }
+  } else {
+    // Row major
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
+      }
     }
   }
 }
 
-
+template <int DataLayout>
 static void test_large_contraction()
 {
-  Tensor<float, 4> t_left(30, 50, 8, 31);
-  Tensor<float, 5> t_right(8, 31, 7, 20, 10);
-  Tensor<float, 5> t_result(30, 50, 7, 20, 10);
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
 
   t_left.setRandom();
   t_right.setRandom();
 
-  typedef Map<MatrixXf> MapXf;
+  // Add a little offset so that the results won't be close to zero.
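+  // (The checks below use approximate, relative-error comparisons, which are
+  // not meaningful when the reference values are themselves close to zero.)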
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 1500, 248);
   MapXf m_right(t_right.data(), 248, 1400);
-  MatrixXf m_result(1500, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
 
   // this contraction should be equivalent to a single matrix multiplication
   Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
@@ -351,20 +366,20 @@ static void test_large_contraction()
   }
 }
 
-
+template <int DataLayout>
 static void test_matrix_vector()
 {
-  Tensor<float, 2> t_left(30, 50);
-  Tensor<float, 1> t_right(50);
-  Tensor<float, 1> t_result(30);
+  Tensor<float, 2, DataLayout> t_left(30, 50);
+  Tensor<float, 1, DataLayout> t_right(50);
+  Tensor<float, 1, DataLayout> t_result(30);
 
   t_left.setRandom();
   t_right.setRandom();
 
-  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic>> MapXf;
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 30, 50);
   MapXf m_right(t_right.data(), 50, 1);
-  Eigen::Matrix<float, Dynamic, Dynamic> m_result(30, 1);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
 
   // this contraction should be equivalent to a single matrix multiplication
   Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
@@ -379,18 +394,19 @@ static void test_matrix_vector()
 }
 
 
+template <int DataLayout>
 static void test_tensor_vector()
 {
-  Tensor<float, 3> t_left(7, 13, 17);
-  Tensor<float, 2> t_right(1, 7);
-  typedef typename Tensor<float, 1>::DimensionPair DimensionPair;
+  Tensor<float, 3, DataLayout> t_left(7, 13, 17);
+  Tensor<float, 2, DataLayout> t_right(1, 7);
+  typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
   Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
-  Tensor<float, 3> t_result = t_left.contract(t_right, dim_pair01);
+  Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
 
-  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic>> MapXf;
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
   MapXf m_left(t_left.data(), 7, 13*17);
   MapXf m_right(t_right.data(), 1, 7);
-  Eigen::Matrix<float, Dynamic, Dynamic> m_result = m_left.transpose() * m_right.transpose();
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
 
   for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
     VERIFY_IS_APPROX(t_result(i), m_result(i, 0));
@@ -398,18 +414,63 @@ static void test_tensor_vector()
 }
 
 
+template <int DataLayout>
+static void test_small_blocking_factors()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1);
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  // Force the cache sizes, which results in smaller blocking factors.
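+  // (The three arguments are the l1, l2 and l3 cache sizes, in bytes.)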
+  Eigen::setCpuCacheSizes(896, 1920, 2944);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+  Tensor<float, 5, DataLayout> t_result;
+  t_result = t_left.contract(t_right, dims);
+
+  // compute result using a simple eigen matrix product
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93);
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
+
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+
 void test_cxx11_tensor_contraction()
 {
-  CALL_SUBTEST(test_evals());
-  CALL_SUBTEST(test_scalar());
-  CALL_SUBTEST(test_multidims());
-  CALL_SUBTEST(test_holes());
-  CALL_SUBTEST(test_full_redux());
-  CALL_SUBTEST(test_contraction_of_contraction());
-  CALL_SUBTEST(test_expr());
-  CALL_SUBTEST(test_out_of_order_contraction());
-  CALL_SUBTEST(test_consistency());
-  CALL_SUBTEST(test_large_contraction());
-  CALL_SUBTEST(test_matrix_vector());
-  CALL_SUBTEST(test_tensor_vector());
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_scalar<ColMajor>());
+  CALL_SUBTEST(test_scalar<RowMajor>());
+  CALL_SUBTEST(test_multidims<ColMajor>());
+  CALL_SUBTEST(test_multidims<RowMajor>());
+  CALL_SUBTEST(test_holes<ColMajor>());
+  CALL_SUBTEST(test_holes<RowMajor>());
+  CALL_SUBTEST(test_full_redux<ColMajor>());
+  CALL_SUBTEST(test_full_redux<RowMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST(test_consistency<ColMajor>());
+  CALL_SUBTEST(test_consistency<RowMajor>());
+  CALL_SUBTEST(test_large_contraction<ColMajor>());
+  CALL_SUBTEST(test_large_contraction<RowMajor>());
+  CALL_SUBTEST(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp
new file mode 100644
index 000000000..059d23de1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cuda.cpp
@@ -0,0 +1,474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// TODO(mdevin): Free the cuda memory.
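+// Until that is done, each test below leaks the device buffers it obtains
+// through cudaMalloc.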
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_elementwise_small() {
+  Tensor<float, 1> in1(Eigen::array<int, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<int, 1>(2));
+  Tensor<float, 1> out(Eigen::array<int, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, Eigen::array<int, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+      d_in2, Eigen::array<int, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+      d_out, Eigen::array<int, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(
+        out(Eigen::array<int, 1>(i)),
+        in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+  }
+}
+
+void test_cuda_elementwise()
+{
+  Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  in1.setRandom();
+  in2.setRandom();
+  in3.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t in3_bytes = in3.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_in3;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_in3), in3_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+      }
+    }
+  }
+}
+
+
+void test_cuda_reduction()
+{
+  Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113));
+  Tensor<float, 2> out(Eigen::array<int, 2>(72,97));
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113));
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97));
+
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = 0;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected =
+              std::max(expected, in1(Eigen::array<int, 4>(i, k, j, l)));
+        }
+      }
+      VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_cuda_contraction()
+{
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31));
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31));
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1));
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1));
+
+  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+  MapXf m_left(t_left.data(), 300, 93);
+  MapXf m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+
+  m_result = m_left * m_right;
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl;
+      assert(false);
+    }
+  }
+}
+
+static void test_cuda_convolution_1d()
+{
+  Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
+  Tensor<float, 1> kernel(Eigen::array<int, 1>(4));
+  Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4));
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137));
+
+  Eigen::array<int, 1> dims(1);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 34; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(Eigen::array<int, 4>(i,j,k,l));
+          const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) +
+                                 input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3));
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+}
+
+
+static void test_cuda_convolution_2d()
+{
+  Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
+  Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4));
+  Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4));
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137));
+
+  Eigen::array<int, 2> dims(1,2);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(Eigen::array<int, 4>(i,j,k,l));
+          const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) +
+                                 input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) +
+                                 input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) +
+                                 input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3));
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+}
+
+
+static void test_cuda_convolution_3d()
+{
+  Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17));
+  Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2));
+  Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2));
+  Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17));
+
+  Eigen::array<int, 3> dims(1,2,3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 136; ++l) {
+          for (int m = 0; m < 17; ++m) {
+            const float result = out(Eigen::array<int, 5>(i,j,k,l,m));
+            const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) +
+                                   input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) +
+                                   input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) +
+                                   input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1));
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+  }
+}
+
+static float* CudaCopyFloat(float* data, int size) {
+  const int nbytes = size * sizeof(float);
+  float* result = NULL;
+  if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) {
+    return NULL;
+  } else {
+    if (data != NULL) {
+      cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice);
+    }
+    return result;
+  }
+}
+
+static void test_cuda_constant_broadcast()
+{
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Tensor<float, 1> t1(10);
+  for (int i = 0; i < 10; ++i) {
+    t1(i) = 10.0f * i;
+  }
+  float* t1_cuda = CudaCopyFloat(t1.data(), t1.size());
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10);
+
+  Tensor<float, 1> t2(1);
+  t2 = t2.constant(20.0f);
+  float* t2_cuda = CudaCopyFloat(t2.data(), t2.size());
+  Eigen::TensorMap<Eigen::TensorFixedSize<float, Eigen::Sizes<1> > > t2_gpu(t2_cuda, 1);
+
+  float* t3_cuda = CudaCopyFloat(NULL, 10);
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10);
+
+  t3_gpu.device(gpu_device) =
+      t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10));
+
+  Eigen::Tensor<float, 1> t3(10);
+  cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float),
+             cudaMemcpyDeviceToHost);
+
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+}
+
+void test_cxx11_tensor_cuda()
+{
+  CALL_SUBTEST(test_cuda_elementwise_small());
+  CALL_SUBTEST(test_cuda_elementwise());
+  CALL_SUBTEST(test_cuda_reduction());
+  CALL_SUBTEST(test_cuda_contraction<ColMajor>());
+  CALL_SUBTEST(test_cuda_contraction<RowMajor>());
+  CALL_SUBTEST(test_cuda_convolution_1d());
+  CALL_SUBTEST(test_cuda_convolution_2d());
+  CALL_SUBTEST(test_cuda_convolution_3d());
+  CALL_SUBTEST(test_cuda_constant_broadcast());
+}
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
index 26465ee11..f2d7e4ce6 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -22,23 +22,23 @@ using Eigen::RowMajor;
 
 // Context for evaluation on cpu
 struct CPUContext {
-  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(Eigen::array<int, 2>(2,2)), kernel_3d_(Eigen::array<int, 3>(2,2,2)) {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
     kernel_1d_(0) = 3.14f;
     kernel_1d_(1) = 2.7f;
 
-    kernel_2d_(Eigen::array<int, 2>(0,0)) = 3.14f;
-    kernel_2d_(Eigen::array<int, 2>(1,0)) = 2.7f;
-    kernel_2d_(Eigen::array<int, 2>(0,1)) = 0.2f;
-    kernel_2d_(Eigen::array<int, 2>(1,1)) = 7.0f;
-
-    kernel_3d_(Eigen::array<int, 3>(0,0,0)) = 3.14f;
-    kernel_3d_(Eigen::array<int, 3>(0,1,0)) = 2.7f;
-    kernel_3d_(Eigen::array<int, 3>(0,0,1)) = 0.2f;
-    kernel_3d_(Eigen::array<int, 3>(0,1,1)) = 7.0f;
-    kernel_3d_(Eigen::array<int, 3>(1,0,0)) = -1.0f;
-    kernel_3d_(Eigen::array<int, 3>(1,1,0)) = -0.3f;
-    kernel_3d_(Eigen::array<int, 3>(1,0,1)) = -0.7f;
-    kernel_3d_(Eigen::array<int, 3>(1,1,1)) = -0.5f;
+    kernel_2d_(0,0) = 3.14f;
+    kernel_2d_(1,0) = 2.7f;
+    kernel_2d_(0,1) = 0.2f;
+    kernel_2d_(1,1) = 7.0f;
+
+    kernel_3d_(0,0,0) = 3.14f;
+    kernel_3d_(0,1,0) = 2.7f;
+    kernel_3d_(0,0,1) = 0.2f;
+    kernel_3d_(0,1,1) = 7.0f;
+    kernel_3d_(1,0,0) = -1.0f;
+    kernel_3d_(1,1,0) = -0.3f;
+    kernel_3d_(1,0,1) = -0.7f;
+    kernel_3d_(1,1,1) = -0.5f;
   }
 
   const Eigen::DefaultDevice& device() const { return cpu_device_; }
@@ -93,8 +93,8 @@ struct GPUContext {
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
   Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
   Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
-  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, Eigen::array<int, 2>(2, 2)); }
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, Eigen::array<int, 3>(2, 2, 2)); }
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
 
 private:
   const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
@@ -150,8 +150,8 @@ static void test_contraction(Context* context)
 template <typename Context>
 static void test_1d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(40,49,70));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,70);
 
   Eigen::array<int, 1> dims(1);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
@@ -160,8 +160,8 @@ static void test_1d_convolution(Context* context)
 template <typename Context>
 static void test_2d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(40,49,69));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,69);
 
   Eigen::array<int, 2> dims(1,2);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
@@ -170,8 +170,8 @@ static void test_2d_convolution(Context* context)
 template <typename Context>
 static void test_3d_convolution(Context* context)
 {
-  Eigen::DSizes<int, 3> indices(Eigen::array<int, 3>(0,0,0));
-  Eigen::DSizes<int, 3> sizes(Eigen::array<int, 3>(39,49,69));
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(39,49,69);
 
   Eigen::array<int, 3> dims(0,1,2);
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
@@ -179,9 +179,9 @@ static void test_3d_convolution(Context* context)
 
 static void test_cpu() {
-  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(40,50,70));
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
 
   in1 = in1.random() + in1.constant(10.0f);
   in2 = in2.random() + in2.constant(10.0f);
@@ -191,7 +191,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -200,7 +200,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k))) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
     }
   }
@@ -209,7 +209,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -217,11 +217,11 @@ static void test_cpu() {
   test_contraction(&context);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
-      const float result = out(Eigen::array<int, 3>(i,j,0));
+      const float result = out(i,j,0);
       float expected = 0;
       for (int k = 0; k < 50; ++k) {
         for (int l = 0; l < 70; ++l) {
-          expected += in1(Eigen::array<int, 3>(i, k, l)) * in2(Eigen::array<int, 3>(j, k, l));
+          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
@@ -232,7 +232,7 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f));
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
       }
     }
   }
@@ -241,9 +241,9 @@ static void test_cpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f) +
-                               (in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
         if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
           continue;
         }
@@ -256,11 +256,11 @@ static void test_cpu() {
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f) +
-                               (in1(Eigen::array<int, 3>(i+1,j,k)) * -1.0f + in1(Eigen::array<int, 3>(i+1,j+1,k)) * -0.3f +
-                                in1(Eigen::array<int, 3>(i+1,j,k+1)) * -0.7f + in1(Eigen::array<int, 3>(i+1,j+1,k+1)) * -0.5f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
         if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
           continue;
         }
@@ -271,9 +271,9 @@ static void test_cpu() {
 }
 
 static void test_gpu() {
-  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(40,50,70));
-  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(40,50,70));
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
   in1 = in1.random() + in1.constant(10.0f);
   in2 = in2.random() + in2.constant(10.0f);
@@ -291,9 +291,9 @@ static void test_gpu() {
   cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
 
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(40,50,70));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(40,50,70));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(40,50,70));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
 
   GPUContext context(gpu_in1, gpu_in2, gpu_out);
   test_contextual_eval(&context);
@@ -301,7 +301,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -311,7 +311,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k))) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
       }
     }
   }
@@ -321,7 +321,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f + 2.718f);
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
       }
     }
   }
@@ -330,11 +330,11 @@ static void test_gpu() {
   assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
-      const float result = out(Eigen::array<int, 3>(i,j,0));
+      const float result = out(i,j,0);
       float expected = 0;
       for (int k = 0; k < 50; ++k) {
         for (int l = 0; l < 70; ++l) {
-          expected += in1(Eigen::array<int, 3>(i, k, l)) * in2(Eigen::array<int, 3>(j, k, l));
+          expected += in1(i, k, l) * in2(j, k, l);
         }
       }
       VERIFY_IS_APPROX(expected, result);
@@ -347,7 +347,7 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f));
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
       }
     }
   }
@@ -358,9 +358,9 @@ static void test_gpu() {
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
         VERIFY_IS_APPROX(expected, result);
       }
     }
@@ -372,11 +372,11 @@ static void test_gpu() {
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
-        const float result = out(Eigen::array<int, 3>(i,j,k));
-        const float expected = (in1(Eigen::array<int, 3>(i,j,k)) * 3.14f + in1(Eigen::array<int, 3>(i,j+1,k)) * 2.7f +
-                                in1(Eigen::array<int, 3>(i,j,k+1)) * 0.2f + in1(Eigen::array<int, 3>(i,j+1,k+1)) * 7.0f +
-                                in1(Eigen::array<int, 3>(i+1,j,k)) * -1.0f + in1(Eigen::array<int, 3>(i+1,j+1,k)) * -0.3f +
-                                in1(Eigen::array<int, 3>(i+1,j,k+1)) * -0.7f + in1(Eigen::array<int, 3>(i+1,j+1,k+1)) * -0.5f);
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
         VERIFY_IS_APPROX(expected, result);
       }
     }
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index c806b623f..0cc4e86f7 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -16,12 +16,15 @@ using Eigen::Tensor;
 
 static void test_dynamic_size()
 {
-  Eigen::DSizes<int, 3> dimensions(Eigen::array<int, 3>{{2,3,7}});
+  Eigen::DSizes<int, 3> dimensions(2,3,7);
 
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
   VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions[0], 2);
+  VERIFY_IS_EQUAL((int)dimensions[1], 3);
+  VERIFY_IS_EQUAL((int)dimensions[2], 7);
 }
 
 static void test_fixed_size()
@@ -37,9 +40,9 @@ static void test_fixed_size()
 
 static void test_match()
 {
-  Eigen::DSizes<int, 3> dyn(Eigen::array<int, 3>{{2,3,7}});
+  Eigen::DSizes<int, 3> dyn(2,3,7);
   Eigen::Sizes<2,3,7> stat;
-  VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true);
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
 }
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index e85fcbfa9..792fdeade 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -125,6 +125,12 @@ static void test_3d()
   mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
   Tensor<float, 3, RowMajor> mat8(2,3,7);
   mat8 = (-mat2).exp() * 3.14f;
+  Tensor<float, 3, RowMajor> mat9(2,3,7);
+  mat9 = mat2 + 3.14f;
+  Tensor<float, 3, RowMajor> mat10(2,3,7);
+  mat10 = mat2 - 3.14f;
+  Tensor<float, 3, RowMajor> mat11(2,3,7);
+  mat11 = mat2 / 3.14f;
 
   val = 1.0;
   for (int i = 0; i < 2; ++i) {
@@ -136,6 +142,9 @@ static void test_3d()
         VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
         VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
         VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
+        VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
+        VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
+        VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
         val += 1.0;
       }
     }
@@ -172,6 +181,36 @@ static void test_constants()
   }
 }
 
+static void test_boolean()
+{
+  Tensor<int, 1> vec(6);
+  std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+
+  // Test ||.
+  Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
+  VERIFY_IS_EQUAL(bool1[0], true);
+  VERIFY_IS_EQUAL(bool1[1], false);
+  VERIFY_IS_EQUAL(bool1[2], false);
+  VERIFY_IS_EQUAL(bool1[3], false);
+  VERIFY_IS_EQUAL(bool1[4], false);
+  VERIFY_IS_EQUAL(bool1[5], true);
+
+  // Test &&, including cast of operand vec.
+  Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
+  VERIFY_IS_EQUAL(bool2[0], false);
+  VERIFY_IS_EQUAL(bool2[1], true);
+  VERIFY_IS_EQUAL(bool2[2], true);
+  VERIFY_IS_EQUAL(bool2[3], true);
+  VERIFY_IS_EQUAL(bool2[4], false);
+  VERIFY_IS_EQUAL(bool2[5], false);
+
+  // Compilation tests:
+  // Test Tensor<bool, 1> against results of cast or comparison; verifies that
+  // CoeffReturnType is set to match Op return type of bool for Unary and Binary
+  // Ops.
+  Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
+  bool3 = vec < vec.constant(4) && bool2;
+}
 
 static void test_functors()
 {
@@ -258,6 +297,7 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_boolean());
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 529584a7b..ad9de867d 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -45,7 +45,34 @@ static void test_simple()
 }
 
 
+static void test_const()
+{
+  MatrixXf input(3,3);
+  input.setRandom();
+  MatrixXf output = input;
+  output.rowwise() -= input.colwise().maxCoeff();
+
+  Eigen::array<int, 1> depth_dim;
+  depth_dim[0] = 0;
+  Tensor<float, 2>::Dimensions dims2d;
+  dims2d[0] = 1;
+  dims2d[1] = 3;
+  Eigen::array<int, 2> bcast;
+  bcast[0] = 3;
+  bcast[1] = 1;
+  const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3);
+  Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
+    }
+  }
+}
+
+
 void test_cxx11_tensor_forced_eval()
 {
   CALL_SUBTEST(test_simple());
+  CALL_SUBTEST(test_const());
 }
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 55d35eac0..26854f5a4 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -28,6 +28,9 @@ static void test_simple_patch()
   VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
 
   for (int i = 0; i < tensor.size(); ++i) {
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
     VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
   }
 
@@ -51,6 +54,9 @@ static void test_simple_patch()
             if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
               expected = tensor(d, r-1+i, c-2+j, b);
             }
+            if (entire_image_patch(d, r, c, patchId, b) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+            }
             VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
           }
         }
@@ -68,6 +74,11 @@ static void test_simple_patch()
   VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
   VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
 
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
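+  // (The SAME-padding total is (out-1)*stride + ksize - in = 1 per spatial
+  // dimension here, and the top/left share rounds down to 0.)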
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 5; ++j) {
       int patchId = i+3*j;
@@ -76,8 +87,13 @@ static void test_simple_patch()
         for (int d = 0; d < 2; ++d) {
           for (int b = 0; b < 7; ++b) {
             float expected = 0.0f;
-            if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) {
-              expected = tensor(d, r-1+i, c-1+j, b);
+            int row_offset = r*stride + i - row_padding;
+            int col_offset = c*stride + j - col_padding;
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+              expected = tensor(d, row_offset, col_offset, b);
+            }
+            if (twod_patch(d, r, c, patchId, b) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
             }
             VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
           }
@@ -88,6 +104,156 @@ static void test_simple_patch()
   }
 }
 
+// Verifies VALID padding (no padding) with incrementing values.
+static void test_patch_padding_valid()
+{
+  int input_depth = 3;
+  int input_rows = 3;
+  int input_cols = 3;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // No padding is carried out.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with the same value.
+static void test_patch_padding_valid_same_value()
+{
+  int input_depth = 1;
+  int input_rows = 5;
+  int input_cols = 5;
+  int input_batches = 2;
+  int ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
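+  // A constant-valued input makes every extracted patch identical, so any
+  // mis-indexing inside extract_image_patches() shows up as a mismatch below.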
+ Tensor tensor(input_depth, input_rows, input_cols, input_batches); + tensor = tensor.constant(11.0f); + Tensor result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. + int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies SAME padding. +static void test_patch_padding_same() +{ + int input_depth = 3; + int input_rows = 4; + int input_cols = 2; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // Based on the calculation described in TensorTraits.h, padding happens to be + // 0. 
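+  // (With the usual SAME formula (out-1)*stride + ksize - in, the rows give
+  // (2-1)*2 + 2 - 4 = 0 and the cols give (1-1)*2 + 2 - 2 = 0, so SAME
+  // padding degenerates to no padding here.)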
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} static void test_patch_no_extra_dim() { @@ -103,6 +269,9 @@ static void test_patch_no_extra_dim() VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); } @@ -124,6 +293,9 @@ static void test_patch_no_extra_dim() if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { expected = tensor(d, r-1+i, c-2+j); } + if (entire_image_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); } } @@ -139,6 +311,11 @@ static void test_patch_no_extra_dim() VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ int row_padding = 0; + int col_padding = 0; + int stride = 1; + for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { int patchId = i+3*j; @@ -146,8 +323,13 @@ static void test_patch_no_extra_dim() for (int c = 0; c < 2; ++c) { for (int d = 0; d < 2; ++d) { float expected = 0.0f; - if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { - expected = tensor(d, r-1+i, c-1+j); + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset); + } + if (twod_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; } VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); } @@ -181,6 +363,9 @@ static void test_imagenet_patches() if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { expected = l_in(d, r-5+i, c-5+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -208,6 +393,9 @@ static void test_imagenet_patches() if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { expected = l_in(d, r-4+i, c-4+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -235,6 +423,9 @@ static void test_imagenet_patches() if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { expected = l_in(d, r-3+i, c-3+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -262,6 +453,9 @@ static void test_imagenet_patches() if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { expected = l_in(d, r-1+i, c-1+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -271,10 +465,12 @@ static void test_imagenet_patches() } } - void test_cxx11_tensor_image_patch() { CALL_SUBTEST(test_simple_patch()); CALL_SUBTEST(test_patch_no_extra_dim()); + CALL_SUBTEST(test_patch_padding_valid()); + CALL_SUBTEST(test_patch_padding_valid_same_value()); + CALL_SUBTEST(test_patch_padding_same()); CALL_SUBTEST(test_imagenet_patches()); } diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index 478c20306..9cf2eb150 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -29,6 +29,7 @@ static void test_1d() vec1(4) = 23; vec2(4) = 4; vec1(5) = 42; vec2(5) = 5; + VERIFY_IS_EQUAL(vec1.rank(), 1); VERIFY_IS_EQUAL(vec1.size(), 6); VERIFY_IS_EQUAL(vec1.dimension(0), 6); @@ -69,10 +70,12 @@ static void test_2d() TensorMap> mat3(mat1.data(), 2, 3); TensorMap> mat4(mat2.data(), 2, 3); + VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); VERIFY_IS_EQUAL(mat3.dimension(0), 2); VERIFY_IS_EQUAL(mat3.dimension(1), 3); + 
VERIFY_IS_EQUAL(mat4.rank(), 2); VERIFY_IS_EQUAL(mat4.size(), 6); VERIFY_IS_EQUAL(mat4.dimension(0), 2); VERIFY_IS_EQUAL(mat4.dimension(1), 3); @@ -109,13 +112,15 @@ static void test_3d() } TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), 2, 3, 7); + TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); + VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); VERIFY_IS_EQUAL(mat3.dimension(0), 2); VERIFY_IS_EQUAL(mat3.dimension(1), 3); VERIFY_IS_EQUAL(mat3.dimension(2), 7); + VERIFY_IS_EQUAL(mat4.rank(), 3); VERIFY_IS_EQUAL(mat4.size(), 2*3*7); VERIFY_IS_EQUAL(mat4.dimension(0), 2); VERIFY_IS_EQUAL(mat4.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 78b0dade0..b4b0a55b6 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -89,19 +89,19 @@ static void test_reshape_as_lvalue() } } - +template static void test_simple_slice() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); - Tensor slice1(1,1,1,1,1); + Tensor slice1(1,1,1,1,1); Eigen::DSizes indices(1,2,3,4,5); Eigen::DSizes sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - Tensor slice2(1,1,2,2,3); + Tensor slice2(1,1,2,2,3); Eigen::DSizes indices2(1,1,3,4,5); Eigen::DSizes sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); @@ -114,7 +114,7 @@ static void test_simple_slice() } } - +// TODO(andydavis) Add RowMajor support when TensorContract supports RowMajor. static void test_slice_in_expr() { MatrixXf m1(7,7); MatrixXf m2(3,3); @@ -141,21 +141,28 @@ static void test_slice_in_expr() { VERIFY_IS_APPROX(res(i,j), m3(i,j)); } } -} + // Take an arbitrary slice of an arbitrarily sized tensor. 
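+  // (The expression below flattens the 7x7 matrix into 49 coefficients,
+  // applies exp() to each of them, and keeps the first 35 results.)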
+ TensorMap> tensor4(m1.data(), 7, 7); + Tensor tensor6 = tensor4.reshape(DSizes(7*7)).exp().slice(DSizes(0), DSizes(35)); + for (int i = 0; i < 35; ++i) { + VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); + } +} +template static void test_slice_as_lvalue() { - Tensor tensor1(2,2,7); + Tensor tensor1(2,2,7); tensor1.setRandom(); - Tensor tensor2(2,2,7); + Tensor tensor2(2,2,7); tensor2.setRandom(); - Tensor tensor3(4,3,5); + Tensor tensor3(4,3,5); tensor3.setRandom(); - Tensor tensor4(4,3,2); + Tensor tensor4(4,3,2); tensor4.setRandom(); - Tensor result(4,5,7); + Tensor result(4,5,7); Eigen::DSizes sizes12(2,2,7); Eigen::DSizes first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; @@ -190,10 +197,10 @@ static void test_slice_as_lvalue() } } - +template static void test_slice_raw_data() { - Tensor tensor(3,5,7,11); + Tensor tensor(3,5,7,11); tensor.setRandom(); Eigen::DSizes offsets(1,2,3,4); @@ -203,40 +210,78 @@ static void test_slice_raw_data() VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); - extents = Eigen::DSizes(2,1,1,1); - auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); - VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); - VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + if (DataLayout == ColMajor) { + extents = Eigen::DSizes(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + } else { + extents = Eigen::DSizes(1,1,1,2); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5)); + } extents = Eigen::DSizes(1,2,1,1); auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); VERIFY_IS_EQUAL(slice3.data(), static_cast(0)); - offsets = Eigen::DSizes(0,2,3,4); - extents = Eigen::DSizes(3,2,1,1); - auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { - VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes(0,2,3,4); + extents = Eigen::DSizes(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + } else { + offsets = Eigen::DSizes(1,2,3,0); + extents = Eigen::DSizes(1,1,2,11); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 2; ++k) { + VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l)); + } } } - offsets = Eigen::DSizes(0,0,0,4); - extents = Eigen::DSizes(3,5,7,2); - auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 5; ++j) { + if (DataLayout 
== ColMajor) { + offsets = Eigen::DSizes(0,0,0,4); + extents = Eigen::DSizes(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } + } + } + } + } else { + offsets = Eigen::DSizes(1,0,0,0); + extents = Eigen::DSizes(2,5,7,11); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul); + for (int l = 0; l < 11; ++l) { for (int k = 0; k < 7; ++k) { - for (int l = 0; l < 2; ++l) { - int slice_index = i + 3 * (j + 5 * (k + 7 * l)); - VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 2; ++i) { + int slice_index = l + 11 * (k + 7 * (j + 5 * i)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l)); + } } } } + } offsets = Eigen::DSizes(0,0,0,0); @@ -247,14 +292,38 @@ static void test_slice_raw_data() } +static void test_composition() +{ + Eigen::Tensor matrix(7, 11); + matrix.setRandom(); + + const DSizes newDims{{1, 1, 11}}; + Eigen::Tensor tensor = + matrix.slice(DSizes(2, 0), DSizes(1, 11)).reshape(newDims); + + VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul); + VERIFY_IS_EQUAL(tensor.dimension(0), 1); + VERIFY_IS_EQUAL(tensor.dimension(1), 1); + VERIFY_IS_EQUAL(tensor.dimension(2), 11); + for (int i = 0; i < 11; ++i) { + VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i)); + } +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); CALL_SUBTEST(test_reshape_in_expr()); CALL_SUBTEST(test_reshape_as_lvalue()); - CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_simple_slice()); CALL_SUBTEST(test_slice_in_expr()); - CALL_SUBTEST(test_slice_as_lvalue()); - CALL_SUBTEST(test_slice_raw_data()); + CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_raw_data()); + CALL_SUBTEST(test_slice_raw_data()); + + CALL_SUBTEST(test_composition()); } diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp index 0ffa341c4..8d05d154e 100644 --- a/unsupported/test/cxx11_tensor_of_strings.cpp +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -8,19 +8,18 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#include "main.h" -#include + #include -using std::string; using Eigen::Tensor; using Eigen::TensorMap; static void test_assign() { - string data1[6]; - TensorMap> mat1(data1, 2, 3); - string data2[6]; - const TensorMap> mat2(data2, 2, 3); + std::string data1[6]; + TensorMap> mat1(data1, 2, 3); + std::string data2[6]; + const TensorMap> mat2(data2, 2, 3); for (int i = 0; i < 6; ++i) { std::ostringstream s1; @@ -31,16 +30,16 @@ static void test_assign() data2[i] = s2.str(); } - Tensor rslt1; + Tensor rslt1; rslt1 = mat1; - Tensor rslt2; + Tensor rslt2; rslt2 = mat2; - Tensor rslt3 = mat1; - Tensor rslt4 = mat2; + Tensor rslt3 = mat1; + Tensor rslt4 = mat2; - Tensor rslt5(mat1); - Tensor rslt6(mat2); + Tensor rslt5(mat1); + Tensor rslt6(mat2); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -57,8 +56,8 @@ static void test_assign() static void test_concat() { - Tensor t1(2, 3); - Tensor t2(2, 3); + Tensor t1(2, 3); + Tensor t2(2, 3); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -71,7 +70,7 @@ static void test_concat() } } - Tensor result = t1.concatenate(t2, 1); + Tensor result = t1.concatenate(t2, 1); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 6); @@ -86,7 +85,7 @@ static void test_concat() static void test_slices() { - Tensor data(2, 6); + Tensor data(2, 6); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { std::ostringstream s1; @@ -99,8 +98,8 @@ static void test_slices() const Eigen::DSizes first_half{{0, 0}}; const Eigen::DSizes second_half{{0, 3}}; - Tensor t1 = data.slice(first_half, half_size); - Tensor t2 = data.slice(second_half, half_size); + Tensor t1 = data.slice(first_half, half_size); + Tensor t2 = data.slice(second_half, half_size); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -113,8 +112,8 @@ static void test_slices() static void test_additions() { - Tensor data1(3); - Tensor data2(3); + Tensor data1(3); + Tensor data2(3); for (int i = 0; i < 3; ++i) { data1(i) = "abc"; std::ostringstream s1; @@ -122,16 +121,26 @@ static void test_additions() data2(i) = s1.str(); } - Tensor sum = data1 + data2; + Tensor sum = data1 + data2; for (int i = 0; i < 3; ++i) { std::ostringstream concat; concat << "abc" << i; - string expected = concat.str(); + std::string expected = concat.str(); VERIFY_IS_EQUAL(sum(i), expected); } } +static void test_initialization() +{ + Tensor a(2, 3); + a.setConstant(std::string("foo")); + for (int i = 0; i < 2*3; ++i) { + VERIFY_IS_EQUAL(a(i), std::string("foo")); + } +} + + void test_cxx11_tensor_of_strings() { // Beware: none of this is likely to ever work on a GPU. 
@@ -139,4 +148,5 @@ void test_cxx11_tensor_of_strings() CALL_SUBTEST(test_concat()); CALL_SUBTEST(test_slices()); CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_initialization()); } diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index 6f74216dd..ffa19896e 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_padding() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array, 4> paddings; @@ -24,7 +25,7 @@ static void test_simple_padding() paddings[2] = std::make_pair(3, 4); paddings[3] = std::make_pair(0, 0); - Tensor padded; + Tensor padded; padded = tensor.pad(paddings); VERIFY_IS_EQUAL(padded.dimension(0), 2+0); @@ -47,9 +48,10 @@ static void test_simple_padding() } } +template static void test_padded_expr() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array, 4> paddings; @@ -62,17 +64,19 @@ static void test_padded_expr() reshape_dims[0] = 12; reshape_dims[1] = 84; - Tensor result; + Tensor result; result = tensor.pad(paddings).reshape(reshape_dims); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 6; ++j) { for (int k = 0; k < 12; ++k) { for (int l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); if (j >= 2 && j < 5 && k >= 3 && k < 8) { - VERIFY_IS_EQUAL(result(i+2*j,k+12*l), tensor(i,j-2,k-3,l)); + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); } else { - VERIFY_IS_EQUAL(result(i+2*j,k+12*l), 0.0f); + VERIFY_IS_EQUAL(result_value, 0.0f); } } } @@ -80,9 +84,10 @@ static void test_padded_expr() } } - void test_cxx11_tensor_padding() { - CALL_SUBTEST(test_simple_padding()); - CALL_SUBTEST(test_padded_expr()); + CALL_SUBTEST(test_simple_padding()); + CALL_SUBTEST(test_simple_padding()); + CALL_SUBTEST(test_padded_expr()); + CALL_SUBTEST(test_padded_expr()); } diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp index e2ba5bfd8..0ee7b46d4 100644 --- a/unsupported/test/cxx11_tensor_patch.cpp +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -36,6 +36,23 @@ static void test_simple_patch() VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); } + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + Tensor single_patch; + single_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + patch_dims[0] = 1; patch_dims[1] = 2; patch_dims[2] = 2; diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index da9885166..99e19eba4 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -13,15 +13,15 @@ using Eigen::Tensor; -static void test_simple_reductions() -{ - Tensor tensor(2,3,5,7); +template +static void test_simple_reductions() { + Tensor tensor(2, 3, 5, 7); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; - Tensor result = tensor.sum(reduction_axis); + Tensor result = tensor.sum(reduction_axis); VERIFY_IS_EQUAL(result.dimension(0), 2); 
VERIFY_IS_EQUAL(result.dimension(1), 5); for (int i = 0; i < 2; ++i) { @@ -36,6 +36,53 @@ static void test_simple_reductions() } } + { + Tensor sum1 = tensor.sum(); + VERIFY_IS_EQUAL(sum1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor sum2 = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(sum2.dimension(0), 1); + + VERIFY_IS_APPROX(sum1(0), sum2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float prod = 1.0f; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + prod *= tensor(k, i, l, j); + } + } + VERIFY_IS_APPROX(result(i, j), prod); + } + } + + { + Tensor prod1 = tensor.prod(); + VERIFY_IS_EQUAL(prod1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor prod2 = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(prod2.dimension(0), 1); + + VERIFY_IS_APPROX(prod1(0), prod2(0)); + } + reduction_axis[0] = 0; reduction_axis[1] = 2; result = tensor.maximum(reduction_axis); @@ -53,6 +100,21 @@ static void test_simple_reductions() } } + { + Tensor max1 = tensor.maximum(); + VERIFY_IS_EQUAL(max1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor max2 = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(max2.dimension(0), 1); + + VERIFY_IS_APPROX(max1(0), max2(0)); + } + reduction_axis[0] = 0; reduction_axis[1] = 1; result = tensor.minimum(reduction_axis); @@ -63,24 +125,72 @@ static void test_simple_reductions() float min_val = (std::numeric_limits::max)(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { - min_val = (std::min)(min_val, tensor(k, l, i, j)); + min_val = (std::min)(min_val, tensor(k, l, i, j)); } } VERIFY_IS_APPROX(result(i, j), min_val); } } -} + { + Tensor min1 = tensor.minimum(); + VERIFY_IS_EQUAL(min1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor min2 = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(min2.dimension(0), 1); -static void test_full_reductions() -{ - Tensor tensor(2,3); + VERIFY_IS_APPROX(min1(0), min2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float sum = 0.0f; + int count = 0; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + sum += tensor(k, l, i, j); + ++count; + } + } + VERIFY_IS_APPROX(result(i, j), sum / count); + } + } + + { + Tensor mean1 = tensor.mean(); + VERIFY_IS_EQUAL(mean1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor mean2 = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(mean2.dimension(0), 1); + + VERIFY_IS_APPROX(mean1(0), mean2(0)); + } +} + +template +static void test_full_reductions() { + Tensor tensor(2, 3); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; - Tensor result = tensor.sum(reduction_axis); + Tensor result = tensor.sum(reduction_axis); 
VERIFY_IS_EQUAL(result.dimension(0), 1); float sum = 0.0f; @@ -103,30 +213,26 @@ static void test_full_reductions() VERIFY_IS_APPROX(result(0), sqrtf(sum)); } - struct UserReducer { - UserReducer(float offset) : offset_(offset), sum_(0.0f) {} - void reduce(const float val) { - sum_ += val * val; - } - float finalize() const { - return 1.0f / (sum_ + offset_); - } + static const bool PacketAccess = false; + UserReducer(float offset) : offset_(offset) {} + void reduce(const float val, float* accum) { *accum += val * val; } + float initialize() const { return 0; } + float finalize(const float accum) const { return 1.0f / (accum + offset_); } private: - float offset_; - float sum_; + const float offset_; }; -static void test_user_defined_reductions() -{ - Tensor tensor(5,7); +template +static void test_user_defined_reductions() { + Tensor tensor(5, 7); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 1; UserReducer reducer(10.0f); - Tensor result = tensor.reduce(reduction_axis, reducer); + Tensor result = tensor.reduce(reduction_axis, reducer); VERIFY_IS_EQUAL(result.dimension(0), 5); for (int i = 0; i < 5; ++i) { float expected = 10.0f; @@ -138,22 +244,24 @@ static void test_user_defined_reductions() } } - -static void test_tensor_maps() -{ - int inputs[2*3*5*7]; - TensorMap > tensor_map(inputs, 2,3,5,7); - TensorMap > tensor_map_const(inputs, 2,3,5,7); - const TensorMap > tensor_map_const_const(inputs, 2,3,5,7); +template +static void test_tensor_maps() { + int inputs[2 * 3 * 5 * 7]; + TensorMap > tensor_map(inputs, 2, 3, 5, 7); + TensorMap > tensor_map_const(inputs, 2, 3, 5, + 7); + const TensorMap > tensor_map_const_const( + inputs, 2, 3, 5, 7); tensor_map.setRandom(); array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; - Tensor result = tensor_map.sum(reduction_axis); - Tensor result2 = tensor_map_const.sum(reduction_axis); - Tensor result3 = tensor_map_const_const.sum(reduction_axis); + Tensor result = tensor_map.sum(reduction_axis); + Tensor result2 = tensor_map_const.sum(reduction_axis); + Tensor result3 = + tensor_map_const_const.sum(reduction_axis); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 5; ++j) { @@ -170,11 +278,110 @@ static void test_tensor_maps() } } +template +static void test_static_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 97); + in.setRandom(); + +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; +#else + Eigen::IndexList, Eigen::type2index<3> > reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = (std::max)(expected, in(i, k, j, l)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +template +static void test_innermost_last_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(97, 113); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; +#else + // This triggers the use of packets for ColMajor. 
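+  // (type2index encodes the reduced dimensions as compile-time constants,
+  // so the evaluator can recognize the inner-dimension reduction and
+  // vectorize it.)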
+ Eigen::IndexList, Eigen::type2index<1> > reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 97; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 72; ++l) { + expected = (std::max)(expected, in(l, k, i, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +template +static void test_innermost_first_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 53); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 2; + reduction_axis[1] = 3; +#else + // This triggers the use of packets for RowMajor. + Eigen::IndexList, Eigen::type2index<3>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + float expected = -1e10f; + for (int k = 0; k < 97; ++k) { + for (int l = 0; l < 113; ++l) { + expected = (std::max)(expected, in(i, j, k, l)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} -void test_cxx11_tensor_reduction() -{ - CALL_SUBTEST(test_simple_reductions()); - CALL_SUBTEST(test_full_reductions()); - CALL_SUBTEST(test_user_defined_reductions()); - CALL_SUBTEST(test_tensor_maps()); +void test_cxx11_tensor_reduction() { + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_tensor_maps()); + CALL_SUBTEST(test_tensor_maps()); + CALL_SUBTEST(test_static_dims()); + CALL_SUBTEST(test_static_dims()); + CALL_SUBTEST(test_innermost_last_dims()); + CALL_SUBTEST(test_innermost_last_dims()); + CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_first_dims()); } diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 39c623499..ec623e1f9 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -14,9 +14,10 @@ using Eigen::Tensor; using Eigen::array; +template static void test_simple_shuffling() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; shuffles[0] = 0; @@ -24,7 +25,7 @@ static void test_simple_shuffling() shuffles[2] = 2; shuffles[3] = 3; - Tensor no_shuffle; + Tensor no_shuffle; no_shuffle = tensor.shuffle(shuffles); VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); @@ -46,7 +47,7 @@ static void test_simple_shuffling() shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - Tensor shuffle; + Tensor shuffle; shuffle = tensor.shuffle(shuffles); VERIFY_IS_EQUAL(shuffle.dimension(0), 5); @@ -66,9 +67,10 @@ static void test_simple_shuffling() } +template static void test_expr_shuffling() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; @@ -76,10 +78,10 @@ static void test_expr_shuffling() shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - Tensor expected; + Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5,7,3,2); array src_slice_dim{{2,3,1,7}}; array src_slice_start{{0,0,0,0}}; @@ -128,16 +130,17 @@ static void test_expr_shuffling() } +template static void test_shuffling_as_value() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; shuffles[2] = 0; shuffles[3] = 1; shuffles[1] = 2; shuffles[0] = 3; - Tensor 
shuffle(5,7,3,2); + Tensor shuffle(5,7,3,2); shuffle.shuffle(shuffles) = tensor; VERIFY_IS_EQUAL(shuffle.dimension(0), 5); @@ -158,7 +161,10 @@ static void test_shuffling_as_value() void test_cxx11_tensor_shuffling() { - CALL_SUBTEST(test_simple_shuffling()); - CALL_SUBTEST(test_expr_shuffling()); - CALL_SUBTEST(test_shuffling_as_value()); + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_shuffling_as_value()); + CALL_SUBTEST(test_shuffling_as_value()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index a70591c82..23855fca0 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -32,6 +32,7 @@ static void test_1d() vec1(5) = 42; vec2(5) = 5; vec3(5) = 0; vec4.setZero(); + VERIFY_IS_EQUAL((vec1.rank()), 1); VERIFY_IS_EQUAL((vec1.size()), 6); VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); @@ -99,10 +100,12 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; + VERIFY_IS_EQUAL((mat1.rank()), 2); VERIFY_IS_EQUAL((mat1.size()), 6); VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3); + VERIFY_IS_EQUAL((mat2.rank()), 2); VERIFY_IS_EQUAL((mat2.size()), 6); VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3); diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 502569d1d..1feb39dca 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_striding() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array strides; strides[0] = 1; @@ -23,7 +24,7 @@ static void test_simple_striding() strides[2] = 1; strides[3] = 1; - Tensor no_stride; + Tensor no_stride; no_stride = tensor.stride(strides); VERIFY_IS_EQUAL(no_stride.dimension(0), 2); @@ -45,7 +46,7 @@ static void test_simple_striding() strides[1] = 4; strides[2] = 2; strides[3] = 3; - Tensor stride; + Tensor stride; stride = tensor.stride(strides); VERIFY_IS_EQUAL(stride.dimension(0), 1); @@ -65,7 +66,36 @@ static void test_simple_striding() } +template +static void test_striding_as_lvalue() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + + Tensor result(3, 12, 10, 21); + result.stride(strides) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l)); + } + } + } + } +} + + void test_cxx11_tensor_striding() { - CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_striding_as_lvalue()); + CALL_SUBTEST(test_striding_as_lvalue()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f0de61f8b..e25912279 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,11 +9,11 @@ #define EIGEN_USE_THREADS -#include + #include "main.h" +#include #include - using Eigen::Tensor; static void test_multithread_elementwise() @@ -60,12 +60,12 @@ static void test_multithread_compound_assignment() } } - +template static void 
test_multithread_contraction() { - Tensor t_left(30, 50, 37, 31); - Tensor t_right(37, 31, 70, 2, 10); - Tensor t_result(30, 50, 70, 2, 10); + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor t_result(30, 50, 70, 2, 10); t_left.setRandom(); t_right.setRandom(); @@ -74,11 +74,10 @@ static void test_multithread_contraction() typedef Tensor::DimensionPair DimPair; Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); - - typedef Map MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 1500, 1147); MapXf m_right(t_right.data(), 1147, 1400); - MatrixXf m_result(1500, 1400); + Matrix m_result(1500, 1400); Eigen::ThreadPoolDevice thread_pool_device(4); @@ -95,12 +94,12 @@ static void test_multithread_contraction() } } - +template static void test_contraction_corner_cases() { - Tensor t_left(32, 500); - Tensor t_right(32, 28*28); - Tensor t_result(500, 28*28); + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; @@ -110,10 +109,10 @@ static void test_contraction_corner_cases() typedef Tensor::DimensionPair DimPair; Eigen::array dims{{DimPair(0, 0)}}; - typedef Map MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 32, 500); MapXf m_right(t_right.data(), 32, 28*28); - MatrixXf m_result(500, 28*28); + Matrix m_result(500, 28*28); Eigen::ThreadPoolDevice thread_pool_device(12); @@ -181,18 +180,18 @@ static void test_contraction_corner_cases() } } - +template static void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random(1, 5000); - Tensor left(internal::random(1, 80), - contract_size, - internal::random(1, 100)); + Tensor left(internal::random(1, 80), + contract_size, + internal::random(1, 100)); - Tensor right(internal::random(1, 25), - internal::random(1, 37), - contract_size, - internal::random(1, 51)); + Tensor right(internal::random(1, 25), + internal::random(1, 37), + contract_size, + internal::random(1, 51)); left.setRandom(); right.setRandom(); @@ -206,13 +205,13 @@ static void test_multithread_contraction_agrees_with_singlethread() { Eigen::ThreadPoolDevice thread_pool_device(internal::random(2, 11)); - Tensor st_result; + Tensor st_result; st_result = left.contract(right, dims); - Tensor tp_result(st_result.dimensions()); + Tensor tp_result(st_result.dimensions()); tp_result.device(thread_pool_device) = left.contract(right, dims); - VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions())); + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); for (ptrdiff_t i = 0; i < st_result.size(); i++) { // if both of the values are very small, then do nothing (because the test will fail // due to numerical precision issues when values are small) @@ -241,17 +240,30 @@ static void test_memcpy() { } +static void test_multithread_random() +{ + Eigen::ThreadPoolDevice device(2); + Tensor t(1 << 20); + t.device(device) = t.random>(); +} + + void test_cxx11_tensor_thread_pool() { CALL_SUBTEST(test_multithread_elementwise()); CALL_SUBTEST(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + 
CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>());

   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST(test_contraction_corner_cases());
+  CALL_SUBTEST(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST(test_contraction_corner_cases<RowMajor>());

   CALL_SUBTEST(test_memcpy());
+
+  CALL_SUBTEST(test_multithread_random());
 }
--
cgit v1.2.3


From 641e824c56db8fffb2f6091d18f913e040c1ea95 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 15 Jan 2015 11:11:48 -0800
Subject: Added cube() operation

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h |  6 ++++++
 unsupported/test/cxx11_tensor_expr.cpp          | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 8860f622b..e08ac6aa1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -83,6 +83,12 @@ class TensorBase
       return unaryExpr(internal::scalar_square_op<Scalar>());
     }

+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+    cube() const {
+      return unaryExpr(internal::scalar_cube_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
     inverse() const {
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 792fdeade..695565e9b 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -32,6 +32,9 @@ static void test_1d()
   float data4[6];
   TensorMap<Tensor<float, 1>> vec4(data4, 6);
   vec4 = vec2.square();
+  float data5[6];
+  TensorMap<Tensor<float, 1>> vec5(data5, 6);
+  vec5 = vec2.cube();

   VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
   VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
@@ -47,6 +50,13 @@ static void test_1d()
   VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
   VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);

+  VERIFY_IS_APPROX(vec5(0), 0.0f);
+  VERIFY_IS_APPROX(vec5(1), 1.0f);
+  VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f);
+
   vec3 = vec1 + vec2;
   VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
   VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
--
cgit v1.2.3


From 14f537c296710173c76379d8efec59bfb1d78eb7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 16 Jan 2015 09:09:23 -0800
Subject: gcc doesn't consider that
 template <typename OtherDerived> TensorStridingOp& operator = (const OtherDerived& other)
 provides a valid assignment operator for the striding operation, and
 therefore refuses to compile code like:
 result.stride(foo) = source.stride(bar);
 Added the explicit
 TensorStridingOp& operator = (const TensorStridingOp& other)
 as a workaround to get the code to compile, and did the same in all the
 operations that can be used as lvalues.
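
A minimal sketch of the failing pattern (dimensions and names are
illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>

    void strided_copy() {
      Eigen::Tensor<float, 2> source(8, 8);
      Eigen::Tensor<float, 2> result(8, 8);
      source.setRandom();
      Eigen::array<ptrdiff_t, 2> strides{{2, 2}};
      // Both sides of the assignment are TensorStridingOp expressions of the
      // same type, so the templated operator= is not considered for the
      // assignment; without the explicit overload gcc rejects this line.
      result.stride(strides) = source.stride(strides);
    }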
--- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 10 ++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 19 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 9 +++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 9 +++++++++ unsupported/test/cxx11_tensor_chipping.cpp | 21 +++++++++++++++++++++ unsupported/test/cxx11_tensor_morphing.cpp | 13 +++++++++++++ unsupported/test/cxx11_tensor_shuffling.cpp | 17 +++++++++++++++++ unsupported/test/cxx11_tensor_striding.cpp | 18 ++++++++++++++++++ 8 files changed, 116 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index bc336e488..503803d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -101,6 +101,16 @@ class TensorChippingOp : public TensorBase > const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 23b595ac3..87a4b0758 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -73,6 +73,15 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) @@ -257,6 +266,16 @@ class TensorSlicingOp : public TensorBase Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: typename XprType::Nested m_xpr; const StartIndices m_indices; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index ab5fc6a69..620a63ae7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -73,6 +73,15 @@ class TensorShufflingOp : public TensorBase const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 2fbdfadfe..5aa2c8d3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -73,6 +73,15 @@ class TensorStridingOp : public TensorBase > const typename internal::remove_all::type& 
expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 0de7bbac6..d83417872 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -318,8 +318,29 @@ static void test_chip_as_lvalue() } } } + + Tensor input7(2,3,5,7,11); + input7.setRandom(); + tensor = input1; + tensor.chip(0, 0) = input7.chip(0, 0); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } } + template static void test_chip_raw_data() { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index b4b0a55b6..7fd7a283a 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -161,6 +161,8 @@ static void test_slice_as_lvalue() tensor3.setRandom(); Tensor tensor4(4,3,2); tensor4.setRandom(); + Tensor tensor5(10,13,12); + tensor5.setRandom(); Tensor result(4,5,7); Eigen::DSizes sizes12(2,2,7); @@ -195,6 +197,17 @@ static void test_slice_as_lvalue() } } } + + Eigen::DSizes sizes5(4,5,7); + Eigen::DSizes fifth_slice(0,0,0); + result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5); + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k)); + } + } + } } template diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index ec623e1f9..2f7fd9e50 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -157,6 +157,23 @@ static void test_shuffling_as_value() } } } + + array no_shuffle; + no_shuffle[0] = 0; + no_shuffle[1] = 1; + no_shuffle[2] = 2; + no_shuffle[3] = 3; + Tensor shuffle2(5,7,3,2); + shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l)); + } + } + } + } } void test_cxx11_tensor_shuffling() diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 1feb39dca..935b908cc 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -89,6 +89,24 @@ static void test_striding_as_lvalue() } } } + + array no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; + Tensor result2(3, 12, 10, 21); + result2.stride(strides) = tensor.stride(no_strides); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l)); + } + } + } + } } -- cgit v1.2.3 From 46fc881e4ae23ef577ee20dcd61a5a74cba8b874 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 
26 Jan 2015 17:46:40 -0800 Subject: Added a few benchmarks for the tensor code --- bench/tensors/tensor_benchmarks.h | 305 +++++++++++++++++++++++++++++++++ bench/tensors/tensor_benchmarks_cpu.cc | 156 +++++++++++++++++ bench/tensors/tensor_benchmarks_gpu.cc | 75 ++++++++ 3 files changed, 536 insertions(+) create mode 100644 bench/tensors/tensor_benchmarks.h create mode 100644 bench/tensors/tensor_benchmarks_cpu.cc create mode 100644 bench/tensors/tensor_benchmarks_gpu.cc diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h new file mode 100644 index 000000000..525b9acda --- /dev/null +++ b/bench/tensors/tensor_benchmarks.h @@ -0,0 +1,305 @@ +#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ +#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ + +typedef int TensorIndex; +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "testing/base/public/benchmark.h" + +using Eigen::Tensor; +using Eigen::TensorMap; + + +// TODO(bsteiner): also templatize on the input type since we have users +// for int8 as well as floats. +template class BenchmarkSuite { + public: + BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) + : m_(m), k_(k), n_(n), device_(device) { + initialize(); + } + + BenchmarkSuite(const Device& device, size_t m) + : m_(m), k_(m), n_(m), device_(device) { + initialize(); + } + + ~BenchmarkSuite() { + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + } + + void memcpy(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void random(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + TensorMap, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = C.random(); + } + // Record the number of random numbers generated per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void slicing(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes quarter_sizes(Eigen::array(m_/2, m_/2)); + const Eigen::DSizes first_quadrant(Eigen::array(0, 0)); + const Eigen::DSizes second_quadrant(Eigen::array(0, m_/2)); + const Eigen::DSizes third_quadrant(Eigen::array(m_/2, 0)); + const Eigen::DSizes fourth_quadrant(Eigen::array(m_/2, m_/2)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } + // Record the number of values copied from the rhs slice to the lhs slice + // each second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void shuffling(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array size_a(m_, k_); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(k_, 
m_); + TensorMap, Eigen::Aligned> B(b_, size_b); + + const Eigen::array shuffle(1, 0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } + // Record the number of values shuffled from A and copied to B each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void padding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array size_a(m_, k_-3); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(k_, m_); + TensorMap, Eigen::Aligned> B(b_, size_b); + + Eigen::array, 2> paddings; + paddings[0] = Eigen::IndexPair(0, 0); + paddings[1] = Eigen::IndexPair(2, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.pad(paddings); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void striding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array size_a(m_, k_); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(m_, k_ / 2); + TensorMap, Eigen::Aligned> B(b_, size_b); + + const Eigen::array strides(1, 2); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.stride(strides); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void broadcasting(int num_iters) { + const Eigen::array size_a(m_, 1); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_c(m_, n_); + TensorMap, Eigen::Aligned> C(c_, size_c); + +#if defined(__CUDACC__) + // nvcc doesn't support cxx11 + const Eigen::array broadcast(1, n_); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. 
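+  // (The broadcast factor along dimension 0 is the compile-time constant 1;
+  // only the factor along dimension 1 is a runtime value, set below.)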
+    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
+    broadcast.set(1, n_);
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+    // Record the number of values broadcasted from A and copied to C each second
+    finalizeBenchmark(m_ * n_ * num_iters);
+  }
+
+  void coeffWiseOp(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
+    }
+    // Record the number of FLOP executed per second (2 multiplications and
+    // 1 addition per value)
+    finalizeBenchmark(3 * m_ * m_ * num_iters);
+  }
+
+  void algebraicFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  void transcendentalFunc(int num_iters) {
+    eigen_assert(m_ == k_ && k_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  // Simple reduction
+  void reduction(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size(k_, n_);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
+    const Eigen::array<TensorIndex, 1> output_size(n_);
+    TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+
+    const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(m_ * m_ * num_iters);
+  }
+
+  // do a contraction which is equivalent to a matrix multiplication
+  void contraction(int num_iters) {
+    const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
+    const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
+    const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<float, 2>::DimensionPair DimPair;
+    const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (size_ multiplications and
+    // additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
+  }
+
+  void convolution(int num_iters, int kernel_x, int kernel_y) {
+    const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
+    const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    const Eigen::array<TensorIndex, 2> result_sizes(
+        m_ - kernel_x + 1, n_ - kernel_y + 1);
+    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+    }
+    // Record the number of FLOP executed per second (kernel_size
+    // multiplications and additions for each value in the resulting tensor)
+    finalizeBenchmark(
+        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
+  }
+
+ private:
+  void initialize() {
+    a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
+    b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
+    c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+
+    // Initialize the content of the memory pools to prevent asan from
+    // complaining.
+    device_.memset(a_, 12, m_ * k_ * sizeof(float));
+    device_.memset(b_, 23, k_ * n_ * sizeof(float));
+    device_.memset(c_, 31, m_ * n_ * sizeof(float));
+
+    BenchmarkUseRealTime();
+  }
+
+  inline void finalizeBenchmark(int64 num_items) {
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      device_.synchronize();
+    }
+#endif
+    StopBenchmarkTiming();
+    SetBenchmarkItemsProcessed(num_items);
+  }
+
+
+  size_t m_;
+  size_t k_;
+  size_t n_;
+  float* a_;
+  float* b_;
+  float* c_;
+  Device device_;
+};
+#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
new file mode 100644
index 000000000..68653ba15
--- /dev/null
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -0,0 +1,156 @@
+#define EIGEN_USE_THREADS
+
+#include "base/sysinfo.h"
+#include "strings/strcat.h"
+#include "third_party/eigen3/tensor_benchmarks.h"
+#include "thread/threadpool.h"
+
+#ifdef __ANDROID__
+#define CREATE_THREAD_POOL(threads) \
+Eigen::ThreadPoolDevice device(threads);
+#else
+#define CREATE_THREAD_POOL(threads) \
+ThreadPool tp(threads); \
+tp.StartWorkers(); \
+Eigen::ThreadPoolDevice device(&tp, threads);
+#endif
+
+// Simple functions
+#define BM_FuncCPU(FUNC, THREADS) \
+  static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
+    StopBenchmarkTiming(); \
+    CREATE_THREAD_POOL(THREADS); \
+    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+    suite.FUNC(iters); \
+    SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
+  } \
+  BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
+
+BM_FuncCPU(memcpy, 4);
+BM_FuncCPU(memcpy, 8);
+BM_FuncCPU(memcpy, 12);
+
+BM_FuncCPU(random, 4);
+BM_FuncCPU(random, 8);
+BM_FuncCPU(random, 12);
+
+BM_FuncCPU(slicing, 4);
+BM_FuncCPU(slicing, 8);
+BM_FuncCPU(slicing, 12);
+
+BM_FuncCPU(shuffling, 4);
+BM_FuncCPU(shuffling, 8);
+BM_FuncCPU(shuffling, 12);
+
+BM_FuncCPU(padding, 4);
+BM_FuncCPU(padding, 8);
+BM_FuncCPU(padding, 12);
+
+BM_FuncCPU(striding, 4);
+BM_FuncCPU(striding, 8);
+BM_FuncCPU(striding, 12);
+
+BM_FuncCPU(broadcasting, 4);
+BM_FuncCPU(broadcasting, 8);
+BM_FuncCPU(broadcasting, 12);
+
+BM_FuncCPU(coeffWiseOp, 4);
+BM_FuncCPU(coeffWiseOp, 8);
+BM_FuncCPU(coeffWiseOp, 12);
+
+BM_FuncCPU(algebraicFunc, 4);
+BM_FuncCPU(algebraicFunc, 8);
+BM_FuncCPU(algebraicFunc, 12);
+
+BM_FuncCPU(transcendentalFunc, 4);
+BM_FuncCPU(transcendentalFunc, 8);
+BM_FuncCPU(transcendentalFunc, 12);
+
+BM_FuncCPU(reduction, 4);
+BM_FuncCPU(reduction, 8);
+BM_FuncCPU(reduction, 12);
+
+
+// Contractions
+#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
+  static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
+    StopBenchmarkTiming(); \
+    if (THREADS == 1) { \
+
Eigen::DefaultDevice device; \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); + + +BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); + + +// Convolutions +#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ + static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); + +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); + +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); + +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); + +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc new file mode 100644 index 000000000..adea754ad --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -0,0 +1,75 @@ +#define EIGEN_USE_GPU + +#include +#include +#include +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" + + + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + 
    Eigen::GpuDevice device(&stream); \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+    cudaDeviceSynchronize(); \
+    suite.FUNC(iters); \
+    cudaStreamDestroy(stream); \
+  } \
+  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
+BM_FuncGPU(broadcasting);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(reduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
+    StopBenchmarkTiming(); \
+    cudaStream_t stream; \
+    cudaStreamCreate(&stream); \
+    Eigen::GpuDevice device(&stream); \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
+    cudaDeviceSynchronize(); \
+    suite.FUNC(iters); \
+    cudaStreamDestroy(stream); \
+  } \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
+    StopBenchmarkTiming(); \
+    cudaStream_t stream; \
+    cudaStreamCreate(&stream); \
+    Eigen::GpuDevice device(&stream); \
+    BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+    cudaDeviceSynchronize(); \
+    suite.FUNC(iters, DIM1, DIM2); \
+    cudaStreamDestroy(stream); \
+  } \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-- cgit v1.2.3

From 9dfdbd7e568bd3aa9a4610986dcfc679b9ea425d Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 27 Jan 2015 14:15:31 -0800
Subject: Improved the performance of tensor reductions that preserve the innermost dimension(s).
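
A minimal sketch of the access pattern this optimization targets, assuming an
Eigen checkout on the include path and a C++11 compiler (the tensor sizes are
illustrative, not taken from this patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      // Rank-3 row-major tensor: reducing dimension 0 preserves the innermost
      // (last) dimensions, so consecutive output coefficients come from
      // consecutive input coefficients and the reduction can use packets.
      Eigen::Tensor<float, 3, Eigen::RowMajor> in(4, 5, 8);
      in.setRandom();

      // Declaring the reduced dimension with an IndexList makes it a
      // compile-time constant, which lets the preserve_inner_most_dims trait
      // added below evaluate to true statically.
      Eigen::IndexList<Eigen::type2index<0> > reduce_dim;
      Eigen::Tensor<float, 2, Eigen::RowMajor> out = in.sum(reduce_dim);

      // Spot-check one output coefficient against a scalar loop.
      float expected = 0.0f;
      for (int i = 0; i < 4; ++i) expected += in(i, 2, 3);
      std::cout << out(2, 3) << " vs " << expected << std::endl;
      return 0;
    }
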
--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 64 +++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eebcc4850..c6a8ecb5d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -48,6 +48,11 @@ template struct are_inner_most_dims { static const bool value = false; }; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + #if __cplusplus > 199711L template struct are_inner_most_dims{ @@ -61,6 +66,16 @@ struct are_inner_most_dims{ index_statically_eq()(0, NumTensorDims - array_size::value) && index_statically_eq()(array_size::value - 1, NumTensorDims - 1); }; +template +struct preserve_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_gt()(0, 0); +}; +template +struct preserve_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_lt()(array_size::value - 1, NumTensorDims - 1); +}; #endif @@ -108,7 +123,35 @@ struct InnerMostDimReducer { for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizePacket(accum, p); + return reducer.finalizeBoth(accum, p); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet(input), accum); + } } }; @@ -168,11 +211,14 @@ struct TensorEvaluator, Device> }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()) { EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); // Bitmap indicating if an input dimension is reduced or not. 
array reduced; @@ -291,6 +337,20 @@ struct TensorEvaluator, Device> values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } } else { for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index + i); @@ -305,6 +365,7 @@ struct TensorEvaluator, Device> private: template friend struct internal::GenericDimReducer; template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimPreserver; // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". @@ -316,6 +377,7 @@ struct TensorEvaluator, Device> return index * m_preservedStrides[NumOutputDims - 1]; } } + // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; if (Layout == ColMajor) { for (int i = NumOutputDims - 1; i > 0; --i) { -- cgit v1.2.3 From 5a6ea4edf61b5626a781070c6342fc16606b490a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 28 Jan 2015 10:02:47 -0800 Subject: Added more tests to cover tensor reductions --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 43 +++++++++++---- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 62 +++++++++++++++++++++- unsupported/test/cxx11_tensor_reduction.cpp | 37 ++++++++++++- 3 files changed, 128 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 7b8d34321..38586d067 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -37,7 +37,11 @@ template struct SumReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return saccum + predux(vaccum); } }; @@ -45,16 +49,16 @@ template struct SumReducer template struct MeanReducer { static const bool PacketAccess = true; - MeanReducer() : count_(0) { } + MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { (*accum) += t; - count_++; + scalarCount_++; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { (*accum) = padd(*accum, p); - count_ += packet_traits::size; + packetCount_++; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { @@ -65,15 +69,20 @@ template struct MeanReducer return pset1(0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / count_; + return accum / scalarCount_; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T 
saccum, const Packet& vaccum) const { - return (saccum + predux(vaccum)) / count_; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(packetCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits::size); } protected: - int count_; + int scalarCount_; + int packetCount_; }; template struct MaxReducer @@ -99,7 +108,11 @@ template struct MaxReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return (std::max)(saccum, predux_max(vaccum)); } }; @@ -127,7 +140,11 @@ template struct MinReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return (std::min)(saccum, predux_min(vaccum)); } }; @@ -156,7 +173,11 @@ template struct ProdReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return saccum * predux_mul(vaccum); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 209749042..7ff47673d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -181,7 +181,7 @@ template size_t array_prod(const Ind result *= sizes[i]; } return result; -} +}; template struct array_size > { static const size_t value = std::tuple_size >::value; @@ -307,6 +307,52 @@ struct index_statically_ne > { }; +template +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] > value; + } +}; + +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] > value; + } +}; + +template +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_lt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] < value; + } +}; + +template +struct index_statically_lt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] < value; + } +}; + } // end namespace internal } // end namespace Eigen @@ -351,6 +397,20 @@ struct 
index_statically_ne { } }; +template +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 99e19eba4..5c3184833 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -369,6 +369,37 @@ static void test_innermost_first_dims() { } } +template +static void test_reduce_middle_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 53); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 2; +#else + // This triggers the use of packets for RowMajor. + Eigen::IndexList, Eigen::type2index<2>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 97; ++l) { + expected = (std::max)(expected, in(i, k, l, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_simple_reductions()); CALL_SUBTEST(test_simple_reductions()); @@ -380,8 +411,10 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_tensor_maps()); CALL_SUBTEST(test_static_dims()); CALL_SUBTEST(test_static_dims()); - CALL_SUBTEST(test_innermost_last_dims()); CALL_SUBTEST(test_innermost_last_dims()); - CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_last_dims()); CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_reduce_middle_dims()); + CALL_SUBTEST(test_reduce_middle_dims()); } -- cgit v1.2.3 From e896c0ade7c77a18acb1b3ef01f22ef698c1a2a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 29 Jan 2015 10:29:47 -0800 Subject: Marked the contraction operation as read only, since its result can't be assigned. 
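
As a hedged illustration of what the one-line change below rules out (the
sizes and names in this standalone snippet are made up; only the diff itself
is authoritative):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
      a.setRandom();
      b.setRandom();

      typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
      Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};

      // Reading from a contraction expression is fine: it is evaluated into
      // the destination tensor c.
      Eigen::Tensor<float, 2> c = a.contract(b, dims);

      // Writing into a contraction is meaningless. With TensorContractionOp
      // deriving from TensorBase<..., ReadOnlyAccessors>, the line below is
      // rejected at compile time instead of misbehaving at run time.
      // a.contract(b, dims) = c;

      return 0;
    }
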
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a02a273e7..af843654c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -423,7 +423,7 @@ struct traits -class TensorContractionOp : public TensorBase > +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; -- cgit v1.2.3 From 590f4b0aa3583c98fe9a0682e26c24ebfaffeaa6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 30 Jan 2015 19:46:30 -0800 Subject: Silenced some compilation warnings --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 22 +++++++++++----------- .../Eigen/CXX11/src/Tensor/TensorInitializer.h | 12 ------------ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 7ff47673d..c94ed977e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -124,18 +124,18 @@ struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } template - static constexpr bool values_up_to_known_statically(const std::tuple& t) { + static constexpr bool values_up_to_known_statically(const std::tuple&) { return is_compile_time_constant >::type>::value; } template - static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple&) { return true; } }; @@ -271,7 +271,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -279,7 +279,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -294,7 +294,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -302,7 +302,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -318,7 +318,7 @@ template struct index_statically_gt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] > value; + (IndexList()[i] > value); } }; @@ -326,7 +326,7 @@ template struct index_statically_gt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return 
IndexList().value_known_statically(i) & - IndexList()[i] > value; + (IndexList()[i] > value); } }; @@ -341,7 +341,7 @@ template struct index_statically_lt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] < value; + (IndexList()[i] < value); } }; @@ -349,7 +349,7 @@ template struct index_statically_lt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] < value; + (IndexList()[i] < value); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 6afef0fbb..4303e3536 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -55,18 +55,6 @@ struct Initializer { } }; -template -struct Initializer { - typedef std::initializer_list::Scalar> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - // Static initialization not implemented for VarDims tensors. - eigen_assert(false); - } -}; - template void initialize_tensor(TensorEvaluator& tensor, const typename Initializer::NumDimensions>::InitList& vals) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c6a8ecb5d..83ba1df71 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -129,7 +129,7 @@ struct InnerMostDimReducer { template struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); } }; -- cgit v1.2.3 From f64045a060ae22c6445b78ecea3783cef7c1ca3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 30 Jan 2015 19:52:01 -0800 Subject: Silenced a few more compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index e125ca799..0e8a4b8d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -369,7 +369,7 @@ class Tensor : public TensorBase > void resize(const DSizes& dimensions) { array dims; - for (int i = 0; i < NumIndices; ++i) { + for (std::size_t i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; } resize(dims); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 6c9a67c58..d81197e6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -93,7 +93,7 @@ struct Sizes : internal::numeric_list { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex... indices) { } + template Sizes(DenseIndex...) 
{ } explicit Sizes(std::initializer_list /*l*/) { // todo: add assertion } @@ -333,7 +333,7 @@ static const size_t value = Sizes::count; template struct array_size > { static const size_t value = Sizes::count; }; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { return get >::value; } #else -- cgit v1.2.3 From dcb2a8b184c43f9b638406c39c1636e1ff2b1e23 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Feb 2015 02:51:59 -0800 Subject: Added the EIGEN_HAS_CONSTEXPR define Gate the tensor index list code based on the value of EIGEN_HAS_CONSTEXPR --- Eigen/src/Core/util/Macros.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/test/cxx11_tensor_index_list.cpp | 4 ++++ unsupported/test/cxx11_tensor_reduction.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 001907a0b..40a28d4d6 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -133,6 +133,12 @@ #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +// Does the compiler support const expressions? +#if (defined(__plusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index c94ed977e..eed0a9f05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 83ba1df71..21416afe0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -53,7 +53,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR template struct are_inner_most_dims{ static const bool value = indices_statically_known_to_increase()() && diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index d79a3ed45..c4d4f244f 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -11,6 +11,7 @@ #include +#ifdef EIGEN_HAS_CONSTEXPR static void test_static_index_list() { @@ -254,11 +255,14 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } +#endif void test_cxx11_tensor_index_list() { +#ifdef EIGEN_HAS_CONSTEXPR CALL_SUBTEST(test_static_index_list()); CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); +#endif } diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 5c3184833..0269853a9 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -284,7 +284,7 @@ static void 
test_static_dims() { Tensor out(72, 97); in.setRandom(); -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -314,7 +314,7 @@ static void test_innermost_last_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; @@ -345,7 +345,7 @@ static void test_innermost_first_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 2; reduction_axis[1] = 3; @@ -376,7 +376,7 @@ static void test_reduce_middle_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 2; -- cgit v1.2.3 From 2559fa9b0f20ea138cfb019d441ad1757221568d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Feb 2015 02:55:18 -0800 Subject: Fixed compilation error in the tensor broadcasting test --- unsupported/test/cxx11_tensor_broadcasting.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index f0792bdcf..2ddf47234 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -114,7 +114,15 @@ static void test_static_broadcasting() { Tensor tensor(8,3,5); tensor.setRandom(); + +#ifdef EIGEN_HAS_CONSTEXPR Eigen::IndexList, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; +#else + Eigen::array broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; +#endif Tensor broadcast; broadcast = tensor.broadcast(broadcasts); -- cgit v1.2.3 From 668518aed69c3d20efb480acd5944a79df7e5410 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:25:41 +0100 Subject: Fix non initialized entries and comparison of very small numbers --- unsupported/test/cxx11_tensor_contraction.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 6124818fd..2bcae90b8 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -389,7 +389,7 @@ static void test_matrix_vector() m_result = m_left * m_right; for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); } } @@ -399,6 +399,10 @@ static void test_tensor_vector() { Tensor t_left(7, 13, 17); Tensor t_right(1, 7); + + t_left.setRandom(); + t_right.setRandom(); + typedef typename Tensor::DimensionPair DimensionPair; Eigen::array dim_pair01{{{0, 1}}}; Tensor t_result = t_left.contract(t_right, dim_pair01); @@ -409,7 +413,7 @@ static void test_tensor_vector() Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); } } -- cgit v1.2.3 From c03c73c9b7032f984bcd6c52d9ca3a430ce19c69 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:12 +0100 Subject: Fix clang compilation --- unsupported/test/cxx11_tensor_thread_pool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e25912279..f49523683 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -15,6 +15,7 @@ #include using Eigen::Tensor; +using std::isnan; static void test_multithread_elementwise() { -- cgit v1.2.3 From 74e460b9950503ef5a306337a136e1d37795deae Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:24 +0100 Subject: Fix symmetric product --- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 21f8175d2..860e233b9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -374,7 +374,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Fri, 6 Feb 2015 06:00:59 -0800 Subject: Fixed the cxx11_meta test --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 36d91e780..3a08628be 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. * typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; @@ -112,7 +112,7 @@ template struct get<0, type_lis template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static T value = a; }; +template struct get<0, numeric_list> { constexpr static int value = a; }; template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ -- cgit v1.2.3 From 91fe3a30043874e51225c8f25964687320c9b601 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 10:29:28 -0800 Subject: Removed a debug printf statement. 
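
An aside on the portability issue behind the clang fix above (and the
std::isnan qualification a couple of commits further down); this standalone
snippet is illustrative only:

    #include <cmath>
    #include <cstdio>

    int main() {
      const float x = std::nanf("");

      // C99 defines isnan as a macro; C++11 <cmath> replaces it with the
      // overloaded function std::isnan. An *unqualified* isnan in the global
      // namespace is not guaranteed to exist, so code that happens to compile
      // against libstdc++ can fail against libc++/clang. Qualifying the call
      // is the portable spelling.
      std::printf("%d\n", std::isnan(x) ? 1 : 0);
      return 0;
    }
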
--- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index a96d705a4..7e448f7c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -89,9 +89,6 @@ class TensorLayoutSwapOp : public TensorBase, WriteA EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) { - -std::cout << "In assignment operator " << std::endl; - typedef TensorAssignOp Assign; Assign assign(*this, other); internal::TensorExecutor::run(assign, DefaultDevice()); -- cgit v1.2.3 From 4716c2c6666eb7018dac2e2ed050ead45c8933e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:06:19 -0800 Subject: Fixed compilation error --- unsupported/test/cxx11_tensor_thread_pool.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f49523683..6fe65c7f9 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -15,7 +15,7 @@ #include using Eigen::Tensor; -using std::isnan; + static void test_multithread_elementwise() { @@ -122,7 +122,7 @@ static void test_contraction_corner_cases() m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -137,7 +137,7 @@ static void test_contraction_corner_cases() new(&m_left) MapXf(t_left.data(), 32, 1); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -155,7 +155,7 @@ static void test_contraction_corner_cases() new(&m_right) MapXf(t_right.data(), 32, 4); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -173,7 +173,7 @@ static void test_contraction_corner_cases() new(&m_right) MapXf(t_right.data(), 32, 4); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); -- cgit v1.2.3 From 410895a7e4276fa2e1f78dbb953c7045818a86ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:13:19 -0800 Subject: Silenced several compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 16 ++++++++-------- 
unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 18 +++++++++--------- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 93938bd1b..a4f73b2a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // The dimensions of the lhs and the rhs tensors should be equal to prevent // overflows and ensure the result is fully initialized. eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_leftImpl.dimensions())); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 503803d23..698bcfe18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -257,13 +257,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -322,8 +322,8 @@ struct TensorEvaluator, Device> static const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || - (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(this->m_stride == 1); EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; @@ -333,8 +333,8 @@ struct TensorEvaluator, Device> this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || - (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index af843654c..e750c21e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -499,9 +499,9 @@ struct TensorContractionEvaluatorBase // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. typedef typename internal::conditional< - Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; @@ -520,14 +520,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), m_result(NULL) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == - TensorEvaluator::Layout), + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); @@ -681,7 +681,7 @@ struct TensorContractionEvaluatorBase } // If the layout is RowMajor, we need to reverse the m_dimensions - if (Layout == RowMajor) { + if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { std::swap(m_dimensions[i], m_dimensions[j]); } @@ -855,9 +855,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index e358e6a3a..8b87f1045 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -79,9 +79,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 97f225f0a..5e167d4aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -94,14 +94,14 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; } } - Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: Scalar* m_data; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 7e448f7c0..c00810594 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -112,7 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; @@ -169,7 +169,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor,
     CoordAccess = false, // to be implemented
   };
 
-- cgit v1.2.3

From 114e863f086077fc949baf5dfe1f4102222c938e Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 Feb 2015 12:20:24 -0800
Subject: Silenced a few compilation warnings

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 ++++----
 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 ++++++------
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++-------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index ef134adf2..5790e19d6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -106,7 +106,7 @@ struct TensorEvaluator, Device>
       m_dimensions[i] = input_dims[i] * broadcast[i];
     }
 
-    if (Layout == ColMajor) {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_inputStrides[0] = 1;
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
@@ -139,7 +139,7 @@ struct TensorEvaluator, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (Layout == ColMajor) {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return coeffColMajor(index);
     } else {
       return coeffRowMajor(index);
@@ -210,7 +210,7 @@ struct TensorEvaluator, Device>
   template
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
   {
-    if (Layout == ColMajor) {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return packetColMajor(index);
     } else {
       return packetRowMajor(index);
@@ -326,7 +326,7 @@ struct TensorEvaluator, Device>
   }
 
 
-  Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   Dimensions m_dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index e750c21e7..f7254a24d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -536,7 +536,7 @@ struct TensorContractionEvaluatorBase
     DSizes eval_left_dims;
     DSizes eval_right_dims;
     array, ContractDims> eval_op_indices;
-    if (Layout == ColMajor) {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       // For ColMajor, we keep using the existing dimensions
       for (int i = 0; i < LDims; i++) {
         eval_left_dims[i] = m_leftImpl.dimensions()[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 5e167d4aa..488d32cb4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -167,7 +167,7 @@ struct TensorEvaluator
 #endif
   }
 
-  const Scalar* data() const { return m_data; }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
 
  protected:
   const Scalar* m_data;
@@ -218,7 +218,7 @@ struct TensorEvaluator, Device>
     return m_functor.packetOp(index);
   }
 
-  CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
 
  private:
   const NullaryOp m_functor;
@@ -273,7 +273,7 @@ struct TensorEvaluator, Device>
     return m_functor.packetOp(m_argImpl.template packet(index));
   }
 
-  CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
 
  private:
   const UnaryOp m_functor;
@@ -301,7 +301,7 @@ struct
TensorEvaluator::Layout == TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } @@ -337,7 +337,7 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -413,7 +413,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 21416afe0..7643d4cdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } // Precompute output strides. - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> // Precompute input strides. array input_strides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { input_strides[0] = 1; for (int i = 1; i < NumInputDims; ++i) { input_strides[i] = input_strides[i-1] * input_dims[i-1]; @@ -309,7 +309,7 @@ struct TensorEvaluator, Device> Op reducer(m_reducer); if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -330,7 +330,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < packetSize; ++i) { Op reducer(m_reducer); @@ -339,7 +339,7 @@ struct TensorEvaluator, Device> } } else if (PreservingInnerMostDims) { const Index firstIndex = firstInput(index); - const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); @@ -371,7 +371,7 @@ struct TensorEvaluator, Device> // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { if (ReducingInnerMostDims) { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return index * m_preservedStrides[0]; } else { return index * m_preservedStrides[NumOutputDims - 1]; @@ -379,7 +379,7 @@ struct TensorEvaluator, Device> } // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumOutputDims - 1; i > 0; --i) { // This is index_i in the output tensor. const Index idx = index / m_outputStrides[i]; -- cgit v1.2.3 From 057cfd2f02f06650db0634aca6abfbd09da36897 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:25:02 -0800 Subject: Silenced more compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 488d32cb4..d084880de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -85,7 +85,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; @@ -158,7 +158,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { eigen_assert(m_data); - const Index index = (Layout == ColMajor) ? m_dims.IndexOfColMajor(coords) + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); #ifdef __CUDA_ARCH__ return __ldg(m_data+index); @@ -366,8 +366,8 @@ struct TensorEvaluator m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 87a4b0758..1191b2411 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -308,7 +308,7 @@ struct TensorEvaluator, Devi const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; @@ -348,7 +348,7 @@ struct TensorEvaluator, Devi m_impl.evalSubExprsIfNeeded(NULL); if (internal::is_arithmetic::value && data && m_impl.data()) { Index contiguous_values = 1; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { contiguous_values *= dimensions()[i]; if (dimensions()[i] != m_impl.dimensions()[i]) { @@ -394,7 +394,7 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -446,7 +446,7 @@ struct TensorEvaluator, Devi Scalar* result = m_impl.data(); if (result) { Index offset = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (m_dimensions[i] != m_impl.dimensions()[i]) { offset += m_offsets[i] * m_inputStrides[i]; @@ -482,7 +482,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -547,7 +547,7 @@ struct TensorEvaluator, Device> const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 439cf3230..82969b4c0 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device { // Compute strides m_dimensions = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; @@ -138,7 +138,7 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { Index idx = index / m_strides[i]; index -= idx * m_strides[i]; -- cgit v1.2.3 From c21e45fbc5b82a2f99113e8b6ab0005ca01a7428 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:36:26 -0800 Subject: Fixed a few more compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index fb4e7fb11..57a14a037 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -111,7 +111,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -131,7 +131,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { m_leftStrides[0] = 1; m_rightStrides[0] = 1; m_outputStrides[0] = 1; @@ -176,7 +176,7 @@ struct TensorEvaluator subs; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { subs[i] = index / m_outputStrides[i]; index -= subs[i] * m_outputStrides[i]; @@ -193,7 +193,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; @@ -209,7 +209,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 620a63ae7..1012ecd69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -126,7 +126,7 @@ struct TensorEvaluator, Device> array inputStrides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -180,12 +180,12 @@ struct TensorEvaluator, Device> return rslt; } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == 
static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 5aa2c8d3b..00cb8e373 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -123,7 +123,7 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; m_inputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -172,7 +172,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -211,13 +211,13 @@ struct TensorEvaluator, Device> } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -281,7 +281,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; -- cgit v1.2.3 From 780b2422e2b3fd2b50121a6e5642c94b030fbf5b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:43:55 -0800 Subject: Silenced the last batch of compilation warnings triggered by gcc 4.8 --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 698bcfe18..dc9586cbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -167,7 +167,7 @@ struct TensorEvaluator, Device> m_stride = 1; m_inputStride = 1; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < m_dim.actualDim(); ++i) { m_stride *= input_dims[i]; m_inputStride *= input_dims[i]; @@ -208,8 +208,8 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is 
equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -220,8 +220,8 @@ struct TensorEvaluator, Device> } PacketReturnType rslt = internal::pload(values); return rslt; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims - 1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index aecef3313..591fd2464 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -236,9 +236,9 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -339,7 +339,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 585ebc778..bf0e7edfb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -121,7 +121,7 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device) { // Only column major tensors are supported for now. 
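Nearly every evaluator patched above precomputes its strides once and then branches on the layout; the two loops differ only in which end of the dimension array varies fastest. A hedged sketch of that precomputation with simplified types (the evaluators store the result in members such as m_inputStrides and m_outputStrides):

#include <array>
#include <cstddef>

// For dims = {2, 3, 4}: column-major strides are {1, 2, 6} (dims[0] is the
// innermost dimension), row-major strides are {12, 4, 1} (dims[N-1] is).
template <std::size_t N>
std::array<std::ptrdiff_t, N> compute_strides(
    const std::array<std::ptrdiff_t, N>& dims, bool col_major) {
  std::array<std::ptrdiff_t, N> s;
  if (col_major) {
    s[0] = 1;
    for (std::size_t i = 1; i < N; ++i) s[i] = s[i - 1] * dims[i - 1];
  } else {
    s[N - 1] = 1;
    for (std::size_t i = N - 1; i > 0; --i) s[i - 1] = s[i] * dims[i];
  }
  return s;
}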
- EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -295,7 +295,7 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 9b14e01f4..2a7dd45c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -104,7 +104,7 @@ struct TensorEvaluator, Device m_dimensions[i] += m_padding[i].first + m_padding[i].second; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -141,7 +141,7 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { @@ -175,7 +175,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } return packetRowMajor(index); @@ -184,7 +184,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { Index inputIndex; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { const Index idx = coords[0]; if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { return Scalar(0); @@ -214,7 +214,7 @@ struct TensorEvaluator, Device return m_impl.coeff(inputIndex); } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 1c03d202f..8a42ab6b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -100,7 +100,7 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device) { // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -232,7 +232,7 @@ struct TensorEvaluator, Device> } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; -- cgit v1.2.3 From fefec723aa44703c1b7884b2ccfa73877a58f500 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 13:16:22 -0800 Subject: Fixed compilation error triggered when trying to vectorize a non vectorizable cuda kernel. 
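The diff that follows resolves the error by selecting between two separately compiled kernels with an IsVectorizable trait, instead of branching at run time inside a single kernel; nvcc then never instantiates packet code for expressions that cannot supply it. A minimal sketch of this compile-time dispatch, with simplified names and a toy evaluator (not the actual Eigen code):

// Toy evaluator: copies src into dst and exposes the flags the trait reads.
struct ToyEval {
  static const bool PacketAccess = true;
  static const bool IsAligned = true;
  static const int PacketSize = 4;
  float* dst;
  const float* src;
  void evalScalar(int i) { dst[i] = src[i]; }
  void evalPacket(int i) {
    for (int k = 0; k < PacketSize; ++k) dst[i + k] = src[i + k];
  }
};

template <typename Evaluator>
struct IsVectorizableSketch {
  static const bool value = Evaluator::PacketAccess && Evaluator::IsAligned;
};

// Primary template: scalar path only. Because the vectorized body lives in a
// separate partial specialization, it is never compiled for evaluators whose
// trait is false, which is the property the commit relies on.
template <typename Evaluator,
          bool Vectorizable = IsVectorizableSketch<Evaluator>::value>
struct ExecutorSketch {
  static void run(Evaluator eval, int size) {
    for (int i = 0; i < size; ++i) eval.evalScalar(i);
  }
};

template <typename Evaluator>
struct ExecutorSketch<Evaluator, true> {
  static void run(Evaluator eval, int size) {
    const int P = Evaluator::PacketSize;
    int i = 0;
    for (; i + P <= size; i += P) eval.evalPacket(i);  // vectorized path
    for (; i < size; ++i) eval.evalScalar(i);          // scalar tail
  }
};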
--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 80 ++++++++++++++++------ 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93fdd907..05ac9bd2f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -22,8 +22,13 @@ namespace Eigen { */ namespace internal { +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + // Default strategy: the expression is evaluated with a single cpu thread. -template::PacketAccess> +template::value> class TensorExecutor { public: @@ -153,34 +158,45 @@ class TensorExecutor template __global__ void __launch_bounds__(1024) - EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); } - else { - // Use the vector path - const Index PacketSize = unpacket_traits::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } +} + +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_Vectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the vector path + const Index PacketSize = unpacket_traits::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); } } -template -class TensorExecutor +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +template +class TensorExecutor { public: typedef typename Expression::Index Index; @@ -192,13 +208,33 @@ class TensorExecutor { const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); const int block_size = maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); const Index size = 
array_prod(evaluator.dimensions()); - LAUNCH_CUDA_KERNEL((EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } }; + #endif } // end namespace internal -- cgit v1.2.3 From f669f5656ab550010c5dd92ce2da7d3fab07babd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:29:47 -0800 Subject: Marked a few functions as EIGEN_DEVICE_FUNC to enable the use of tensors in cuda kernels. --- Eigen/src/Core/util/Memory.h | 48 +++++++++++----------- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 4 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index bacf236fb..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. @@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. 
*/ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. */ -template inline void* conditional_aligned_malloc(size_t size) +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template inline void conditional_aligned_free(void *ptr) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void destruct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template inline T* construct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template inline T* aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template inline T* aligned_new(size_t size) } } -template inline T* conditional_aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); @@ -414,7 +414,7 @@ template inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameters tells on how many objects to call the destructor of T. 
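For context on the EIGEN_DEVICE_FUNC annotations being added in this commit: under nvcc the macro is expected to expand to the __host__ __device__ qualifier pair, so every annotated helper can be compiled for both the host and the device, while a plain host compiler sees it expand to nothing. A hedged sketch of the mechanism (simplified macro name; Eigen's real definition lives in Eigen/src/Core/util/Macros.h):

#if defined(__CUDACC__)
#define DEVICE_FUNC __host__ __device__
#else
#define DEVICE_FUNC
#endif

// Any helper that a CUDA kernel may reach, directly or through the tensor
// evaluators, needs the qualifier; that is why the allocation and
// construction routines in this commit receive EIGEN_DEVICE_FUNC.
template <typename T>
DEVICE_FUNC inline T clamp_index(T i, T lo, T hi) {
  return i < lo ? lo : (i > hi ? hi : i);
}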
*/ -template inline void aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void conditional_aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); conditional_aligned_free(ptr); } -template inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow(new_size); check_size_for_overflow(old_size); @@ -452,7 +452,7 @@ template inline T* conditional_aligned_realloc_new(T* pt } -template inline T* conditional_aligned_new_auto(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. Also fixes Bug 884 @@ -495,7 +495,7 @@ template inline T* conditional_aligned_realloc_new_auto( return result; } -template inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits::RequireInitialization) destruct_elements_of_array(ptr, size); diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 0e8a4b8d6..037219f23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -350,7 +350,7 @@ class Tensor : public TensorBase > } #endif - void resize(const array& dimensions) + EIGEN_DEVICE_FUNC void resize(const array& dimensions) { std::size_t i; Index size = Index(1); @@ -367,7 +367,7 @@ class Tensor : public TensorBase > #endif } - void resize(const DSizes& dimensions) { + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { array dims; for (std::size_t i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d81197e6d..2ad52b2f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -275,7 +275,7 @@ struct DSizes : array { } #endif - DSizes& operator = (const array& other) { + EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { *static_cast(this) = other; return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index dfe85602a..1b227e8c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -112,9 +112,9 @@ class TensorStorage& dimensions() const {return m_dimensions;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const {return m_dimensions;} - void resize(DenseIndex size, const array& nbDimensions) + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array& nbDimensions) { const DenseIndex currentSz = internal::array_prod(m_dimensions); if(size != currentSz) -- cgit v1.2.3 From 6620aaa4b3ad3ae9f38b7b6213e874021579bcd7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:34:42 -0800 Subject: Silenced 
a few compilation warnings generated by nvcc --- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 57a14a037..a1dec76d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -240,7 +240,7 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index a9501336e..41a36cb75 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - Scalar* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } private: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index c00810594..c119b30e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -148,7 +148,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 1191b2411..a93f48ccb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -145,7 +145,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 7643d4cdc..de5747905 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -360,7 +360,7 @@ struct TensorEvaluator, Device> return rslt; } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: template friend struct internal::GenericDimReducer; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 82969b4c0..ad21e966b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -190,7 +190,7 @@ struct TensorEvaluator, Device return rslt; } - Scalar* data() const { return NULL; } + 
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; -- cgit v1.2.3 From 4470c9997559522e9b81810948d9783b58444ae4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:40:18 -0800 Subject: Added a test to validate tensor casting on cuda devices --- unsupported/test/cxx11_tensor_cuda.cpp | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 059d23de1..8c1ca1bf8 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -460,6 +460,45 @@ static void test_cuda_constant_broadcast() } } + +void test_cuda_cast() +{ + Tensor in(Eigen::array(72,53,97)); + Tensor out(Eigen::array(72,53,97)); + in.setRandom(); + + std::size_t in_bytes = in.size() * sizeof(double); + std::size_t out_bytes = out.size() * sizeof(float); + + double* d_in; + float* d_out; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in.template cast(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), static_cast(in(Eigen::array(i,j,k)))); + } + } + } +} + + void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -471,4 +510,5 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_convolution_2d()); CALL_SUBTEST(test_cuda_convolution_3d()); CALL_SUBTEST(test_cuda_constant_broadcast()); + CALL_SUBTEST(test_cuda_cast()); } -- cgit v1.2.3
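For reference, the new test boils down to the following host-side pattern: map raw device buffers as tensors, evaluate the cast on the GPU, and copy the result back. A condensed sketch assuming the 2015-era API used in the test above (the stripped template argument in the cast call is presumably float, given the output tensor's type); CUDA error checking is omitted:

#include <cuda_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

void cast_double_to_float(const double* host_in, float* host_out, int n,
                          const Eigen::GpuDevice& gpu_device) {
  double* d_in;
  float* d_out;
  cudaMalloc((void**)(&d_in), n * sizeof(double));
  cudaMalloc((void**)(&d_out), n * sizeof(float));
  cudaMemcpy(d_in, host_in, n * sizeof(double), cudaMemcpyHostToDevice);

  Eigen::TensorMap<Eigen::Tensor<double, 1> > gpu_in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, n);
  gpu_out.device(gpu_device) = gpu_in.cast<float>();  // runs in a CUDA kernel

  cudaStreamSynchronize(gpu_device.stream());  // wait for the kernel to finish
  cudaMemcpy(host_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_in);
  cudaFree(d_out);
}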