aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/Eigen/CXX11')
-rw-r--r--unsupported/Eigen/CXX11/Core15
-rw-r--r--unsupported/Eigen/CXX11/Tensor75
-rw-r--r--unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h32
-rw-r--r--unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h31
-rw-r--r--unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h435
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/README.md1446
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/Tensor.h306
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h164
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorBase.h573
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h341
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h363
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h258
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h992
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h1384
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h382
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h912
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h126
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h190
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h380
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h154
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h427
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h244
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h300
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h253
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h151
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h54
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h338
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorIO.h53
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h382
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h419
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h70
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h86
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h198
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMap.h291
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h600
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h361
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h248
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h426
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorRef.h429
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h207
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h259
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h103
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h325
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h256
44 files changed, 14852 insertions, 187 deletions
diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core
index 4dc4ab224..292f09564 100644
--- a/unsupported/Eigen/CXX11/Core
+++ b/unsupported/Eigen/CXX11/Core
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,20 +22,26 @@
* module. Note that at this stage, you should not need to include
* this module directly.
*
+ * It also provides a limited fallback for compilers that don't support
+ * CXX11 yet, such as nvcc.
+ *
* \code
* #include <Eigen/CXX11/Core>
* \endcode
*/
-#include <array>
+#include <vector>
+// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
+#if __cplusplus <= 199711L
+#include "src/Core/util/EmulateCXX11Meta.h"
+#else
+#include <array>
#include "src/Core/util/CXX11Workarounds.h"
#include "src/Core/util/CXX11Meta.h"
+#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_CXX11_CORE_MODULE
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 049ce5596..34107ae71 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -1,6 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
@@ -10,9 +11,10 @@
#ifndef EIGEN_CXX11_TENSOR_MODULE
#define EIGEN_CXX11_TENSOR_MODULE
-#include <unsupported/Eigen/CXX11/Core>
+#include "Eigen/src/Core/util/StaticAssert.h"
+#include "unsupported/Eigen/CXX11/Core"
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
@@ -26,14 +28,69 @@
#include <cstddef>
#include <cstring>
+#include <stdint.h>
-#include "src/Tensor/TensorStorage.h"
-#include "src/Tensor/Tensor.h"
+#if __cplusplus > 199711
+#include <random>
+#endif
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
-#endif // EIGEN_CXX11_TENSOR_MODULE
+#ifdef EIGEN_USE_GPU
+#include <cuda_runtime.h>
+#if defined(__CUDACC__)
+#include <curand_kernel.h>
+#endif
+#endif
+
+#include "Eigen/Core"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
index 0e274b801..3a08628be 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
@@ -317,7 +317,7 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts
template<typename Array, int... n>
constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
{
- return {{std_array_get<sizeof...(n) - n - 1>(arr)...}};
+ return {{array_get<sizeof...(n) - n - 1>(arr)...}};
}
template<typename T, std::size_t N>
@@ -335,9 +335,9 @@ constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr)
// an infinite loop)
template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
struct h_array_reduce {
- constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr)))
+ constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)))
{
- return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr));
+ return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr));
}
};
@@ -346,7 +346,7 @@ struct h_array_reduce<Reducer, T, N, 0>
{
constexpr static inline T run(std::array<T, N> arr)
{
- return std_array_get<0>(arr);
+ return array_get<0>(arr);
}
};
@@ -370,12 +370,20 @@ constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce<
return array_reduce<product_op, T, N>(arr);
}
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+ eigen_assert(a.size() > 0);
+ t prod = 1;
+ for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+ return prod;
+}
+
/* zip an array */
template<typename Op, typename A, typename B, std::size_t N, int... n>
constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>)
{
- return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(std_array_get<n>(a), std_array_get<n>(b))... }};
+ return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
}
template<typename Op, typename A, typename B, std::size_t N>
@@ -387,9 +395,9 @@ constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array<
/* zip an array and reduce the result */
template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...))
+constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
{
- return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...);
+ return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
}
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
@@ -403,7 +411,7 @@ constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N>
template<typename Op, typename A, std::size_t N, int... n>
constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>)
{
- return std::array<decltype(Op::run(A())),N>{{ Op::run(std_array_get<n>(a))... }};
+ return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
}
template<typename Op, typename A, std::size_t N>
@@ -415,9 +423,9 @@ constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A,
/* apply stuff to an array and reduce */
template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...))
+constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
{
- return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...);
+ return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
}
template<typename Reducer, typename Op, typename A, std::size_t N>
@@ -497,7 +505,3 @@ InstType instantiate_by_c_array(ArrType* arr)
} // end namespace Eigen
#endif // EIGEN_CXX11META_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
index d71a67590..a590cf4e1 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
@@ -17,9 +17,6 @@
#error Intel Compiler only supports required C++ features since version 13.1.
// note that most stuff in principle works with 13.0 but when combining
// some features, at some point 13.0 will just fail with an internal assertion
-#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1))
-// note that it _should_ work with 3.1 but it was only tested with 3.2
-#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1.
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
@@ -42,32 +39,46 @@
namespace Eigen {
+// Use std::array as Eigen array
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
namespace internal {
/* std::get is only constexpr in C++14, not yet in C++11
* - libstdc++ from version 4.7 onwards has it nevertheless,
* so use that
* - libstdc++ older versions: use _M_instance directly
- * - libc++ from version 3.4 onwards has it IF compiled with
- * -std=c++1y
- * - libc++ older versions or -std=c++11: use __elems_ directly
+ * - libc++ all versions so far: use __elems_ directly
* - all other libs: use std::get to be portable, but
* this may not be constexpr
*/
#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
#define STD_GET_ARR_HACK a._M_instance[I]
-#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11)
+#elif defined(_LIBCPP_VERSION)
#define STD_GET_ARR_HACK a.__elems_[I]
#else
#define STD_GET_ARR_HACK std::template get<I, T, N>(a)
#endif
-template<std::size_t I, class T, std::size_t N> constexpr inline T& std_array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T&& std_array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T const& std_array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+
+template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
#undef STD_GET_ARR_HACK
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+ static const size_t value = N;
+};
+
/* Suppose you have a template of the form
* template<typename T> struct X;
* And you want to specialize it in such a way:
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
new file mode 100644
index 000000000..494f95690
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -0,0 +1,435 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+#define EIGEN_EMULATE_CXX11_META_H
+
+
+
+namespace Eigen {
+
+// The array class is only available starting with cxx11. Emulate our own here
+// if needed
+template <typename T, size_t n> class array {
+ public:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+
+ static const std::size_t size = n;
+
+ T values[n];
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array() { }
+ explicit EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v) {
+ EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+ EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+ EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+ const T& v4) {
+ EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5) {
+ EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6) {
+ EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6, const T& v7) {
+ EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ values[6] = v7;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(
+ const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6, const T& v7, const T& v8) {
+ EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ values[6] = v7;
+ values[7] = v8;
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ array(std::initializer_list<T> l) {
+ eigen_assert(l.size() == n);
+ std::copy(l.begin(), l.end(), values);
+ }
+#endif
+};
+
+
+namespace internal {
+
+/** \internal
+ * \file CXX11/Core/util/EmulateCXX11Meta.h
+ * This file emulates a subset of the functionality provided by CXXMeta.h for
+ * compilers that don't yet support cxx11 such as nvcc.
+ */
+
+struct empty_list { static const std::size_t count = 0; };
+
+template<typename T, typename Tail=empty_list> struct type_list {
+ typedef T HeadType;
+ typedef Tail TailType;
+ static const T head;
+ static const Tail tail;
+ static const std::size_t count = 1 + Tail::count;
+};
+
+struct null_type { };
+
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
+ typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
+ typename T7 = null_type, typename T8 = null_type>
+struct make_type_list {
+ typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
+
+ typedef type_list<T1, tailresult> type;
+};
+
+template<> struct make_type_list<> {
+ typedef empty_list type;
+};
+
+
+template <std::size_t index, class TList> struct get_type;
+
+template <class Head, class Tail>
+struct get_type<0, type_list<Head, Tail> >
+{
+ typedef Head type;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get_type<i, type_list<Head, Tail> >
+{
+ typedef typename get_type<i-1, Tail>::type type;
+};
+
+
+/* numeric list */
+template <typename T, T n>
+struct type2val {
+ typedef T type;
+ static const T value = n;
+};
+
+
+template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
+ typedef typename make_type_list<type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+ type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+ type2val<T, V>, type2val<T, V>, type2val<T, V>,
+ type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
+ typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+ type2val<T, V>, type2val<T, V>, type2val<T, V>,
+ type2val<T, V>, type2val<T, V> >::type type;
+};
+
+
+template <std::size_t index, class NList> struct get;
+
+template <std::size_t i>
+struct get<i, empty_list>
+{
+ get() { eigen_assert(false && "index overflow"); }
+ typedef void type;
+ static const char value = '\0';
+};
+
+template <std::size_t i, class Head>
+struct get<i, type_list<Head, empty_list> >
+{
+ get() { eigen_assert(false && "index overflow"); }
+ typedef void type;
+ static const char value = '\0';
+};
+
+template <class Head>
+struct get<0, type_list<Head, empty_list> >
+{
+ typedef typename Head::type type;
+ static const type value = Head::value;
+};
+
+template <class Head, class Tail>
+struct get<0, type_list<Head, Tail> >
+{
+ typedef typename Head::type type;
+ static const type value = Head::value;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get<i, type_list<Head, Tail> >
+{
+ typedef typename Tail::HeadType::type type;
+ static const type value = get<i-1, Tail>::value;
+};
+
+
+template <class NList> struct arg_prod {
+ static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
+};
+template <> struct arg_prod<empty_list> {
+ static const int value = 1;
+};
+
+
+template<int n, typename t>
+array<t, n> repeat(t v) {
+ array<t, n> array;
+ array.fill(v);
+ return array;
+}
+
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) {
+ return get<I, type_list<Head, Tail> >::value;
+}
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) {
+ return get<I, type_list<Head, Tail> >::value;
+}
+
+template <class NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
+ return arg_prod<NList>::value;
+};
+
+template<std::size_t n, typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
+ t prod = 1;
+ for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
+ return prod;
+}
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
+ return 0;
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+ eigen_assert(a.size() > 0);
+ t prod = 1;
+ for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+ return prod;
+}
+
+template<std::size_t I, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
+ return a[I];
+}
+template<std::size_t I, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
+ return a[I];
+}
+
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+ return a[I];
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+ return a[I];
+}
+
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+ static const size_t value = N;
+};
+
+struct sum_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
+};
+struct product_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
+};
+
+struct logical_and_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
+};
+struct logical_or_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
+};
+
+struct equal_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
+};
+struct not_equal_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
+};
+struct lesser_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
+};
+struct lesser_equal_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
+};
+
+struct greater_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
+};
+struct greater_equal_op {
+ template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
+};
+
+struct not_op {
+ template<typename A> static inline bool run(A a) { return !a; }
+};
+struct negation_op {
+ template<typename A> static inline bool run(A a) { return -a; }
+};
+struct greater_equal_zero_op {
+ template<typename A> static inline bool run(A a) { return a >= 0; }
+};
+
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+struct ArrayApplyAndReduce {
+ static inline bool run(const array<A, N>& a) {
+ EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
+ for (size_t i = 2; i < N; ++i) {
+ result = Reducer::run(result, Op::run(a[i]));
+ }
+ return result;
+ }
+};
+
+template<typename Reducer, typename Op, typename A>
+struct ArrayApplyAndReduce<Reducer, Op, A, 1> {
+ static inline bool run(const array<A, 1>& a) {
+ return Op::run(a[0]);
+ }
+};
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+inline bool array_apply_and_reduce(const array<A, N>& a) {
+ return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+struct ArrayZipAndReduce {
+ static inline bool run(const array<A, N>& a, const array<B, N>& b) {
+ EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
+ for (size_t i = 2; i < N; ++i) {
+ result = Reducer::run(result, Op::run(a[i], b[i]));
+ }
+ return result;
+ }
+};
+
+template<typename Reducer, typename Op, typename A, typename B>
+struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
+ static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
+ return Op::run(a[0], b[0]);
+ }
+};
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
+ return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+
+#endif // EIGEN_EMULATE_CXX11_META_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
new file mode 100644
index 000000000..6a4d52cc3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -0,0 +1,1446 @@
+# Eigen Tensors
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+[TOC]
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes. They all are in
+the namespace ```::Eigen.```
+
+
+### Class Tensor&lt;data_type, rank&gt;
+
+This is the class to use to create a tensor and allocate memory for it. The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank. The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable. For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor&lt;data_type, rank&gt;(size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the the ```rank```
+dimensions.
+
+ // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns
+ // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+ Tensor<float, 3> t_3d(2, 3, 4);
+
+ // Resize t_3d by assigning a tensor of different sizes, but same rank.
+ t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor&lt;data_type, rank&gt;(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicitly list of parameters. The array type to use is
+```Eigen::array&lt;Eigen::Index&gt;```. The array can be constructed automatically
+from an initializer list.
+
+ // Create a tensor of strings of rank 2 with sizes 5, 7.
+ Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize&lt;data_type, Sizes&lt;size0, size1, ...&gt;&gt;
+
+Class to use for tensors of fixed size, where the size is known at compile
+time. Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler. FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+ // Create a 4 x 3 tensor of floats.
+ TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap&lt;Tensor&lt;data_type, rank&gt;&gt;
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code. It allows to view any piece of allocated
+memory as a Tensor. Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap&lt;Tensor&lt;data_type, rank&gt;&gt;(data, size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes. The storage has to be
+large enough to hold all the data.
+
+ // Map a tensor of ints on top of stack-allocated storage.
+ int storage[128]; // 2 x 4 x 2 x 8 = 128
+ TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8);
+
+ // The same storage can be viewed as a different tensor.
+ // You can also pass the sizes as an array.
+ TensorMap<int, 2> t_2d(storage, 16, 8);
+
+ // You can also map fixed-size tensors. Here we get a 1d view of
+ // the 2d fixed-size tensor.
+ Tensor<float, Sizes<4, 5>> t_4x3;
+ TensorMap<float, 1> t_12(t_4x3, 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### &lt;data_type&gt; tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```. You must pass as many parameters as the rank of ```tensor```.
+The expression can be used as an l-value to set the value of the element at the
+specified position. The value returned is of the datatype of the tensor.
+
+ // Set the value of the element at position (0, 1, 0);
+ Tensor<float, 3> t_3d(2, 3, 4);
+ t_3d(0, 1, 0) = 12.0f;
+
+ // Initialize all elements to random values.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 4; ++k) {
+ t_3d(i, j, k) = ...some random value...;
+ }
+ }
+ }
+
+ // Print elements of a tensor.
+ for (int i = 0; i < 2; ++i) {
+ LOG(INFO) << t_3d(i, 0, 0);
+ }
+
+
+## TensorLayout
+
+The tensor library supports 2 layouts: ```ColMajor``` (the default) and
+```RowMajor```. Only the default column major layout is currently fully
+supported, and it is therefore not recommended to attempt to use the row major
+layout at the moment.
+
+The layout of a tensor is optionally specified as part of its type. If not
+specified explicitly column major is assumed.
+
+ Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3>
+ TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
+
+All the arguments to an expression must use the same layout. Attempting to mix
+different layouts will result in a compilation error.
+
+It is possible to change the layout of a tensor or an expression using the
+```swap_layout()``` method. Note that this will also reverse the order of the
+dimensions.
+
+ Tensor<float, 2, ColMajor> col_major(2, 4);
+ Tensor<float, 2, RowMajor> row_major(2, 4);
+
+ Tensor<float, 2> col_major_result = col_major; // ok, layouts match
+ Tensor<float, 2> col_major_result = row_major; // will not compile
+
+ // Simple layout swap
+ col_major_result = row_major.swap_layout();
+ eigen_assert(col_major_result.dimension(0) == 4);
+ eigen_assert(col_major_result.dimension(1) == 2);
+
+ // Swap the layout and preserve the order of the dimensions
+ array<int, 2> shuffle(1, 0);
+ col_major_result = row_major.swap_layout().shuffle(shuffle);
+ eigen_assert(col_major_result.dimension(0) == 2);
+ eigen_assert(col_major_result.dimension(1) == 4);
+
+
+## Tensor Operations
+
+The Eigen Tensor library provides a vast library of operations on Tensors:
+numerical operations such as addition and multiplication, geometry operations
+such as slicing and shuffling, etc. These operations are available as methods
+of the Tensor classes, and in some cases as operator overloads. For example
+the following code computes the elementwise addition of two tensors:
+
+ Tensor<float, 3> t1(2, 3, 4);
+ ...set some values in t1...
+ Tensor<float, 3> t2(2, 3, 4);
+ ...set some values in t2...
+ // Set t3 to the element wise sum of t1 and t2
+ Tensor<float, 3> t3 = t1 + t2;
+
+While the code above looks easy enough, it is important to understand that the
+expression ```t1 + t2``` is not actually adding the values of the tensors. The
+expression instead constructs a "tensor operator" object of the class
+TensorCwiseBinaryOp&lt;scalar_sum&gt;, which has references to the tensors
+```t1``` and ```t2```. This is a small C++ object that knows how to add
+```t1``` and ```t2```. It is only when the value of the expression is assigned
+to the tensor ```t3``` that the addition is actually performed. Technically,
+this happens through the overloading of ```operator=()``` in the Tensor class.
+
+This mechanism for computing tensor expressions allows for lazy evaluation and
+optimizations which are what make the tensor library very fast.
+
+Of course, the tensor operators do nest, and the expression ```t1 + t2 *
+0.3f``` is actually represented with the (approximate) tree of operators:
+
+ TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
+
+
+### Tensor Operations and C++ "auto"
+
+Because Tensor operations create tensor operators, the C++ ```auto``` keyword
+does not have its intuitive meaning. Consider these 2 lines of code:
+
+ Tensor<float, 3> t3 = t1 + t2;
+ auto t4 = t1 + t2;
+
+In the first line we allocate the tensor ```t3``` and it will contain the
+result of the addition of ```t1``` and ```t2```. In the second line, ```t4```
+is actually the tree of tensor operators that will compute the addition of
+```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get
+the values of its elements:
+
+ Tensor<float, 3> t3 = t1 + t2;
+ cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+ auto t4 = t1 + t2;
+ cout << t4(0, 0, 0); // Compilation error!
+
+When you use ```auto``` you do not get a Tensor as a result but instead a
+non-evaluated expression. So only use ```auto``` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them. This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory. All the following will work:
+
+ auto t4 = t1 + t2;
+
+ Tensor<float, 3> result = t4; // Could also be: result(t4);
+ cout << result(0, 0, 0);
+
+ TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4;
+ cout << result(0, 0, 0);
+
+ TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+ cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations. As long as you keep the expression as an
+operation, no computation is performed.
+
+ // One way to compute exp((t1 + t2) * 0.2f);
+ auto t3 = t1 + t2;
+ auto t4 = t3 * 0.2f;
+ auto t5 = t4.exp();
+ Tensor<float, 3> result = t5;
+
+ // Another way, exactly as efficient as the previous one:
+ Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expression are Evaluated
+
+There are several ways to control when expressions are evaluated:
+* Assignment to a Tensor, TensorFixedSize, or TensorMap.
+* Use of the eval() method.
+* Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor. In
+the example below, the ```auto``` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor ```result``` causes the evaluation of all the
+operations.
+
+ auto t3 = t1 + t2; // t3 is an Operation.
+ auto t4 = t3 * 0.2f; // t4 is an Operation.
+ auto t5 = t4.exp(); // t5 is an Operation.
+ Tensor<float, 3> result = t5; // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+ // We know that the result is a 4x4x2 tensor!
+ TensorFixedSize<float, 4, 4, 2> result = t5;
+
+Simiarly, assigning an expression to a TensorMap causes its evaluation. Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that are assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time. This is done by inserting a call to the ```eval()``` method of the
+expression Operation.
+
+ // The previous example could have been written:
+ Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+ // If you want to compute (t1 + t2) once ahead of time you can write:
+ Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling ```eval()``` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size. The code above in
+effect does:
+
+ // .eval() knows the size!
+ TensorFixedSize<float, 4, 4, 2> tmp = t1 + t2;
+ Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of ```eval()``` is itself an Operation, so the
+following code does not do what you may think:
+
+ // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
+ auto t3 = (t1 + t2).eval();
+
+ // You can use t3 in another expression. Still no evaluation.
+ auto t4 = (t3 * 0.2f).exp();
+
+ // The value is evaluated when you assign the Operation to a Tensor, using
+ // an intermediate tensor to represent t3.x
+ Tensor<float, 3> result = t4;
+
+While in the examples above calling ```eval()``` does not make a difference in
+performance, in other cases it can make a huge difference. In the expression
+below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+to be evaluated many times:
+
+ Tensor<...> X ...;
+ Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+ * beta).exp();
+
+Inserting a call to ```eval()``` between the ```maximum()``` and
+```reshape()``` calls guarantees that maximum() is only computed once and
+greatly speeds-up execution:
+
+ Tensor<...> Y =
+ ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+ * beta).exp();
+
+In the other example below, the tensor ```Y``` is both used in the expression
+and its assignment. This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+ Tensor<...> Y ...;
+ Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
+expressions ensures that the sum is computed before any updates to ```Y``` are
+done.
+
+ Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of ```Y```
+then you would need to force an eval for correctness by adding an ```eval()```
+call for the right hand side:
+
+ Y.shuffle(...) =
+ (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation. It provides
+overloads for the ```()``` operator that let you access individual values in
+the expression. TensorRef is convenient, because the Operation themselves do
+not provide a way to access individual elements.
+
+ // Create a TensorRef for the expression. The expression is not
+ // evaluated yet.
+ TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+ // Use "ref" to access individual elements. The expression is evaluated
+ // on the fly.
+ float at_0 = ref(0, 0, 0);
+ cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access. However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef. But not always. So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions. The implementations are optimized for
+different environments: single threaded on CPU, multi threaded on CPU, or on a
+GPU using cuda. Additional implementations may be added later.
+
+You can choose which implementation to use with the ```device()``` call. If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the
+library on ARM CPUs. Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+ Tensor<float, 2> a(30, 40);
+ Tensor<float, 2> b(30, 40);
+ Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a ```device()``` call
+before the assignment of the result. For technical C++ reasons this requires
+that the Tensor for the result be declared on its own. This means that you
+have to know the size of the result.
+
+ Eigen::Tensor<float, 2> c(30, 40);
+ c.device(...) = a + b;
+
+The call to ```device()``` must be the last call on the left of the operator=.
+
+You must pass to the ```device()``` call an Eigen device object. There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a ```device()``` call.
+
+ DefaultDevice my_device;
+ c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+ // Create the Eigen ThreadPoolDevice.
+ Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+
+ // Now just use the device when evaluating expressions.
+ Eigen::Tensor<float, 2> c(30, 50);
+ c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with cuda.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operation we mention datatypes
+that are tensor-type specific:
+
+#### &lt;Tensor-Type&gt;::Dimensions
+
+Acts like an array of ints. Has an ```int size``` attribute, and can be
+indexed like an array to access individual values. Used to represent the
+dimensions of a tensor. See ```dimensions()```.
+
+#### &lt;Tensor-Type&gt;::Index
+
+Acts like an ```int```. Used for indexing tensors along their dimensions. See
+```operator()```, ```dimension()```, and ```size()```.
+
+#### &lt;Tensor-Type&gt;::Scalar
+
+Represents the datatype of individual tensor elements. For example, for a
+```Tensor<float>```, ```Scalar``` is the type ```float```. See
+```setConstant()```.
+
+#### &lt;Operation&gt;
+
+We use this pseudo type to indicate that a tensor Operation is returned by a
+method. We indicate in the text the type and dimensions of the tensor that the
+Operation returns after evaluation.
+
+The Operation will have to be evaluated, for example by assigning it to a
+tensor, before you can access the values of the resulting tensor. You can also
+access the values through a TensorRef.
+
+
+## Built-in Tensor Methods
+
+These are usual C++ methods that act on tensors immediately. They are not
+Operations which provide delayed evaluation of their results. Unless specified
+otherwise, all the methods listed below are available on all tensor classes:
+Tensor, TensorFixedSize, and TensorMap.
+
+## Metadata
+
+### int NumDimensions
+
+Constant value indicating the number of dimensions of a Tensor. This is also
+known as the tensor "rank".
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ cout << "Dims " << a.NumDimensions;
+ => Dims 2
+
+### Dimensions dimensions()
+
+Returns an array-like object representing the dimensions of the tensor.
+The actual type of the dimensions() result is <Tensor-Type>::Dimensions.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
+ cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+ << ", dim 1: " << d[1];
+ => Dim size: 2, dim 0: 3, dim 1: 4
+
+If you use a C++11 compiler, you can use ```auto``` to simplify the code:
+
+ const auto& d = a.dimensions();
+ cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+ << ", dim 1: " << d[1];
+ => Dim size: 2, dim 0: 3, dim 1: 4
+
+### Index dimension(Index n)
+
+Returns the n-th dimension of the tensor. The actual type of the
+```dimension()``` result is ```<Tensor-Type>::Index```, but you can
+always use it like an int.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ int dim1 = a.dimension(1);
+ cout << "Dim 1: " << dim1;
+ => Dim 1: 4
+
+### Index size()
+
+Returns the total number of elements in the tensor. This is the product of all
+the tensor dimensions. The actual type of the ```size()``` result is
+```<Tensor-Type>::Index```, but you can always use it like an int.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ cout << "Size: " << a.size();
+ => Size: 12
+
+
+### Getting Dimensions From An Operation
+
+A few operations provide ```dimensions()``` directly,
+e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions
+until the operation is being evaluated. If you need access to the dimensions
+of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
+TensorRef above), which provides ```dimensions()``` and ```dimension()``` as
+above.
+
+TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
+templated contexts where the underlying object could be either a raw Tensor
+or some deferred operation (e.g. a slice of a Tensor). In this case, the
+template code can wrap the object in a TensorRef and reason about its
+dimensionality while remaining agnostic to the underlying type.
+
+
+## Constructors and Copies
+
+TODO.
+
+ Tensor(...)
+ TensorFixedSize(...)
+ TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions)
+ TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+ TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+ Self& operator=(const Self& other)
+ Self& operator=(const OtherDerived& other)
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize are created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized. Similarly,
+when a new TensorMap is created on top of non-initialized memory the memory its
+contents are not initialized.
+
+You can use one of the methods below to initialize the tensor memory. These
+have an immediate effect on the tensor and return the tensor itself as a
+result. These are not tensor Operations which delay evaluation.
+
+### &lt;Tensor-Type&gt; setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value ```val```. ```Scalar```
+is the type of data stored in the tensor. You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+ a.setConstant(12.3f);
+ cout << "Constant: " << endl << a << endl << endl;
+ =>
+ Constant:
+ 12.3 12.3 12.3 12.3
+ 12.3 12.3 12.3 12.3
+ 12.3 12.3 12.3 12.3
+
+Note that ```setConstant()``` can be used on any tensor where the element type
+has a copy constructor and an ```operator=()```:
+
+ Eigen::Tensor<string, 2> a(2, 3);
+ a.setConstant("yolo");
+ cout << "String tensor: " << endl << a << endl << endl;
+ =>
+ String tensor:
+ yolo yolo yolo
+ yolo yolo yolo
+
+
+### &lt;Tensor-Type&gt; setZero()
+
+Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```.
+Returns the tensor itself in case you want to chain another call.
+
+ a.setZero();
+ cout << "Zeros: " << endl << a << endl << endl;
+ =>
+ Zeros:
+ 0 0 0 0
+ 0 0 0 0
+ 0 0 0 0
+
+
+### &lt;Tensor-Type&gt; setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times. The
+most deeply nested lists must contains P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a ```TensorFixedSize<float, 2, 3>``` the initializer list must
+contains 2 lists of 3 floats each.
+
+```setValues()``` returns the tensor itself in case you want to chain another
+call.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+ cout << "a" << endl << a << endl << endl;
+ =>
+ a
+ 0 1 2
+ 3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed. This is valid at each level of nesting. For example the following
+code only sets the values of the first row of the tensor.
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setConstant(1000);
+ a.setValues({{10, 20, 30}});
+ cout << "a" << endl << a << endl << endl;
+ =>
+ a
+ 10 20 30
+ 1000 1000 1000
+
+### &lt;Tensor-Type&gt; setRandom()
+
+Fills the tensor with random values. Returns the tensor itself in case you
+want to chain another call.
+
+ a.setRandom();
+ cout << "Random: " << endl << a << endl << endl;
+ =>
+ Random:
+ 0.680375 0.59688 -0.329554 0.10794
+ -0.211234 0.823295 0.536459 -0.0452059
+ 0.566198 -0.604897 -0.444451 0.257742
+
+You can customize ```setRandom()``` by providing your own random number
+generator as a template argument:
+
+ a.setRandom<MyRandomGenerator>();
+
+Here, ```MyRandomGenerator``` must be a struct with the following member
+functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
+and ```<Tensor-Type>::Index```.
+
+See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+
+ // Custom number generator for use with setRandom().
+ struct MyRandomGenerator {
+ // Default and copy constructors. Both are needed
+ MyRandomGenerator() { }
+ MyRandomGenerator(const MyRandomGenerator& ) { }
+
+ // Return a random value to be used. "element_location" is the
+ // location of the entry to set in the tensor, it can typically
+ // be ignored.
+ Scalar operator()(Eigen::DenseIndex element_location,
+ Eigen::DenseIndex /*unused*/ = 0) const {
+ return <randomly generated value of type T>;
+ }
+
+ // Same as above but generates several numbers at a time.
+ typename internal::packet_traits<Scalar>::type packetOp(
+ Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+ return <a packet of randomly generated values>;
+ }
+ };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+* UniformRandomGenerator
+* NormalRandomGenerator
+
+
+## Data Access
+
+TODO
+
+ const Scalar& operator()(const array<Index, NumIndices>& indices)
+ const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ Scalar& operator()(const array<Index, NumIndices>& indices)
+ Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ Scalar& operator[](Index index)
+ ??? mention coeff() and coeffRef() ???
+
+### Scalar* data()
+### const Scalar* data() const
+
+Returns a pointer to the storage for the tensor. The pointer is const if the
+tensor was const. This allows direct access to the data. The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ float* a_data = a.data();
+ a_data[0] = 123.45f;
+ cout << "a(0, 0): " << a(0, 0);
+ => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor ```Operations```.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operation is evaluated lazily, typically when it is assigned to a
+tensor. See "Controlling when Expression are Evaluated" for more details about
+their evaluation.
+
+### &lt;Operation&gt; constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value ```val```.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+ Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ cout << "c" << endl << c << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ 3 3 3
+ 3 3 3
+
+ c
+ 0.6 0.6 0.6
+ 0.6 0.6 0.6
+
+### &lt;Operation&gt; random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is for example useful to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for ```setRandom()```.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = a + a.random();
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ 1.68038 1.5662 1.82329
+ 0.788766 1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied. The
+requested operations are applied to each element independently.
+
+### &lt;Operation&gt; operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = -a;
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ -1 -1 -1
+ -1 -1 -1
+
+### &lt;Operation&gt; sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### &lt;Operation&gt; rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### &lt;Operation&gt; square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### &lt;Operation&gt; inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### &lt;Operation&gt; exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### &lt;Operation&gt; log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### &lt;Operation&gt; abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### &lt;Operation&gt; pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients. For example, only integer exponents can be used in
+conjuntion with tensors of integer values.
+
+You can use cast() to lift this restriction. For example this computes
+cubic roots of an int Tensor:
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 1, 8}, {27, 64, 125}});
+ Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 0 1 8
+ 27 64 125
+
+ b
+ 0 1 2
+ 3 4 5
+
+### &lt;Operation&gt; operator * (Scalar scale)
+TODO
+
+### &lt;Operation&gt; cwiseMax(Scalar threshold)
+TODO
+
+### &lt;Operation&gt; cwiseMin(Scalar threshold)
+TODO
+
+ ### &lt;Operation&gt; unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### &lt;Operation&gt; operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### &lt;Operation&gt; operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### &lt;Operation&gt; operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### &lt;Operation&gt; operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### &lt;Operation&gt; cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### &lt;Operation&gt; cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise mimimums of the inputs.
+
+### &lt;Operation&gt; Logical operators
+
+The following logical operators are supported as well:
+
+* operator&&(const OtherDerived& other)
+
+* operator||(const OtherDerived& other)
+
+* operator<(const OtherDerived& other)
+
+* operator<=(const OtherDerived& other)
+
+* operator>(const OtherDerived& other)
+
+* operator>=(const OtherDerived& other)
+
+* operator==(const OtherDerived& other)
+
+* operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+ Tensor<bool, 3> if = ...;
+ Tensor<float, 3> then = ...;
+ Tensor<float, 3> else = ...;
+ Tensor<float, 3> result = if.select(then, else);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result. The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
+
+
+## Contractions
+
+TODO
+ contract(const OtherDerived& other, const Dimensions& dims)
+
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many element as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+ // Create a tensor of 3 dimensions: 2, 3, 4
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {6, 5, 4}});
+ // Reduce it along the second dimension (1)...
+ Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+ // ...using the "maximum" operator.
+ // The result is a tensor with one dimension. The size of
+ // that dimension is the same as the first (non-reduced) dimension of a.
+ Eigen::Tensor<int, 1> b = a.maximum(dims);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 2 3
+ 6 5 4
+
+ b
+ 3
+ 6
+
+Example: Reduction along two dimensions.
+
+ Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+ a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+ {7.0f, 6.0f, 5.0f, 4.0f},
+ {8.0f, 9.0f, 10.0f, 11.0f}},
+ {{12.0f, 13.0f, 14.0f, 15.0f},
+ {19.0f, 18.0f, 17.0f, 16.0f},
+ {20.0f, 21.0f, 22.0f, 23.0f}}});
+ // The tensor a has 3 dimensions. We reduce along the
+ // first 2, resulting in a tensor with a single dimension
+ // of size 4 (the last dimension of a.)
+ // Note that we pass the array of reduction dimensions
+ // directly to the maximum() call.
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+ a.maximum(Eigen::array<int, 2>({0, 1}));
+ cout << "b" << endl << b << endl << endl;
+ =>
+ b
+ 20
+ 21
+ 22
+ 23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions. The result is a
+one-dimension tensor with a single value.
+
+ Eigen::Tensor<float, 3> a(2, 3, 4);
+ a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+ {7.0f, 6.0f, 5.0f, 4.0f},
+ {8.0f, 9.0f, 10.0f, 11.0f}},
+ {{12.0f, 13.0f, 14.0f, 15.0f},
+ {19.0f, 18.0f, 17.0f, 16.0f},
+ {20.0f, 21.0f, 22.0f, 23.0f}}});
+ // Reduce along all dimensions using the sum() operator.
+ Eigen::Tensor<float, 1> b = a.sum();
+ cout << "b" << endl << b << endl << endl;
+ =>
+ b
+ 276
+
+
+### &lt;Operation&gt; sum(const Dimensions& new_dims)
+### &lt;Operation&gt; sum()
+
+Reduce a tensor using the sum() operator. The resulting values
+are the sum of the reduced values.
+
+### &lt;Operation&gt; mean(const Dimensions& new_dims)
+### &lt;Operation&gt; mean()
+
+Reduce a tensor using the mean() operator. The resulting values
+are the mean of the reduced values.
+
+### &lt;Operation&gt; maximum(const Dimensions& new_dims)
+### &lt;Operation&gt; maximum()
+
+Reduce a tensor using the maximum() operator. The resulting values are the
+largest of the reduced values.
+
+### &lt;Operation&gt; minimum(const Dimensions& new_dims)
+### &lt;Operation&gt; minimum()
+
+Reduce a tensor using the minimum() operator. The resulting values
+are the smallest of the reduced values.
+
+### &lt;Operation&gt; prod(const Dimensions& new_dims)
+### &lt;Operation&gt; prod()
+
+Reduce a tensor using the prod() operator. The resulting values
+are the product of the reduced values.
+
+### &lt;Operation&gt; reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Convolutions
+
+TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor. They can be used to access slices of tensors, see them with different
+dimensions, or pad tensors with additional data.
+
+### &lt;Operation&gt; reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions. The argument new_dims is an array of Index values. The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+ // Increase the rank of the input tensor by introducing a new dimension
+ // of size 1.
+ Tensor<float, 2> input(7, 11);
+ array<int, 3> three_dims{{7, 11, 1}};
+ Tensor<float, 3> result = input.reshape(three_dims);
+
+ // Decrease the rank of the input tensor by merging 2 dimensions;
+ array<int, 1> one_dim{{7 * 11}};
+ Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
+to one dimension:
+
+ Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 300
+ 100
+ 400
+ 200
+ 500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+ Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+ Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 100
+ 200
+ 300
+ 400
+ 500
+
+The reshape operation is a lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follow:
+
+ Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b;
+ b.reshape(two_dim) = a;
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 300
+ 100
+ 400
+ 200
+ 500
+
+Note that "b" itself was not reshaped but that instead the assignment is done to
+the reshape view of b.
+
+
+### &lt;Operation&gt; shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals to the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+ // Shuffle all dimensions to the left by 1.
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output = input.shuffle({1, 2, 0})
+
+ eigen_assert(output.dimension(0) == 30);
+ eigen_assert(output.dimension(1) == 50);
+ eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+ eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+ eigen_assert(output(..., indices[shuffle[i]], ...) ==
+ input(..., indices[i], ...))
+
+The shuffle operation results in a lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+ // Shuffle all dimensions to the left by 1.
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output(30, 50, 20);
+ output.shuffle({2, 0, 1}) = input;
+
+
+### &lt;Operation&gt; stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions. The argument strides is an
+array of Index values. The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you ```stride()``` a 2D tensor:
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+ Eigen::Tensor<int, 2> b = a.stride(strides);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0 200
+ 900 1100
+
+It is possible to assign a tensor to a stride:
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output(40, 90, 200);
+ output.stride({2, 3, 4}) = input;
+
+
+### &lt;Operation&gt; slice(const StartIndices& startIndices,
+ const Sizes& sizes)
+
+TBD
+
+
+### &lt;Operation&gt; chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500},
+ {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+ Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+ cout << "a" << endl << a << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ 600 700 800
+ 900 1000 1100
+ cout << "row_3" << endl << row_3 << endl;
+ =>
+ row_3
+ 600 700 800
+ cout << "col_2" << endl << col_2 << endl;
+ =>
+ col_2
+ 100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is a
+lvalue. For example:
+
+ Eigen::Tensor<int, 1> a(3);
+ a.setValues({{100, 200, 300}});
+ Eigen::Tensor<int, 2> b(2, 3);
+ b.setZero();
+ b.chip(0, 0) = a;
+ cout << "a" << endl << a << endl;
+ =>
+ a
+ 100
+ 200
+ 300
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 100 200 300
+ 0 0 0
+
+
+### &lt;Operation&gt; reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500},
+ {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::array<bool, 2> reverse({true, false});
+ Eigen::Tensor<int, 2> b = a.reverse(reverse);
+ cout << "a" << endl << a << endl << "b" << endl << b << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ 600 700 800
+ 900 1000 1100
+ b
+ 900 1000 1100
+ 600 700 800
+ 300 400 500
+ 0 100 200
+
+
+TODO
+### &lt;Operation&gt; broadcast(const Broadcast& broadcast)
+
+TODO
+
+### &lt;Operation&gt; concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### &lt;Operation&gt; pad(const PaddingDimensions& padding)
+
+TODO
+
+### &lt;Operation&gt; extract_patches(const PatchDims& patch_dims)
+
+TODO
+
+### &lt;Operation&gt; extract_image_patches(const Index patch_rows, const Index patch_cols,
+ const Index row_stride, const Index col_stride,
+ const PaddingType padding_type)
+
+TODO
+
+
+## Special Operations
+
+### &lt;Operation&gt; cast&lt;T&gt;()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers. This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 1, 2}, {3, 4, 5}});
+ Eigen::Tensor<int, 2> b =
+ (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 0 1 2
+ 3 4 5
+
+ b
+ 0 0 1
+ 1 2 2
+
+
+### &lt;Operation&gt; eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor&lt;T, N&gt;::maximum() currently returns a Tensor&lt;T, 1&gt;. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
+
+## Limitations
+
+* The number of tensor dimensions is currently limited to 250 when using a
+ compiler that supports cxx11. It is limited to only 5 for older compilers.
+* The IndexList class requires a cxx11 compliant compiler. You can use an
+ array of indices instead if you don't have access to a modern compiler.
+* TensorVarDims are only partially supported
+* On GPUs only floating point values are properly tested and optimized for.
+* Complex and integer values are known to be broken on GPUs. If you try to use
+ them you'll most likely end up triggering a static assertion failure such as
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 70ca1433f..037219f23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -1,6 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
@@ -55,70 +56,46 @@ namespace Eigen {
* change dramatically.</dd>
* </dl>
*
- * \ref TopicStorageOrders
+ * \ref TopicStorageOrders
*/
-template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0>
-class Tensor;
-namespace internal {
template<typename Scalar_, std::size_t NumIndices_, int Options_>
-struct traits<Tensor<Scalar_, NumIndices_, Options_>>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
{
- typedef Scalar_ Scalar;
- typedef Dense StorageKind;
- typedef DenseIndex Index;
- enum {
- Options = Options_
- };
-};
-
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
-struct tensor_index_linearization_helper
-{
- constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const& dimensions)
- {
- return std_array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
- std_array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
- tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
- }
-};
-
-template<typename Index, std::size_t NumIndices, bool RowMajor>
-struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
-{
- constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const&)
- {
- return std_array_get<RowMajor ? 0 : NumIndices - 1>(indices);
- }
-};
-} // end namespace internal
-
-template<typename Scalar_, std::size_t NumIndices_, int Options_>
-class Tensor
-{
- static_assert(NumIndices_ >= 1, "A tensor must have at least one index.");
-
public:
typedef Tensor<Scalar_, NumIndices_, Options_> Self;
+ typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_> > Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
- typedef typename internal::traits<Self>::Scalar Scalar;
- typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+ typedef Scalar_ Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef Self DenseType;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+ typedef typename Base::PacketReturnType PacketReturnType;
+
+ enum {
+ IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+ CoordAccess = true,
+ };
- constexpr static int Options = Options_;
- constexpr static std::size_t NumIndices = NumIndices_;
+ static const int Options = Options_;
+ static const std::size_t NumIndices = NumIndices_;
+ typedef DSizes<Index, NumIndices_> Dimensions;
protected:
TensorStorage<Scalar, NumIndices, Dynamic, Options> m_storage;
public:
- EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
- EIGEN_STRONG_INLINE std::array<Index, NumIndices> dimensions() const { return m_storage.dimensions(); }
- EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); }
- EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
- EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
+ // Metadata
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const { return m_storage.dimensions(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
// work, because that uses base().coeffRef() - and we don't yet
@@ -126,146 +103,254 @@ class Tensor
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
- void setZero()
- {
- // FIXME: until we have implemented packet access and the
- // expression engine w.r.t. nullary ops, use this
- // as a kludge. Only works with POD types, but for
- // any standard usage, this shouldn't be a problem
- memset((void *)data(), 0, size() * sizeof(Scalar));
- }
-
- inline Self& operator=(Self const& other)
- {
- m_storage = other.m_storage;
- return *this;
- }
-
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
- static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return coeff(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
+#endif
- inline const Scalar& coeff(const std::array<Index, NumIndices>& indices) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
- inline const Scalar& coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
- static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return coeffRef(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
+#endif
- inline Scalar& coeffRef(const std::array<Index, NumIndices>& indices)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
- inline Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
- static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return this->operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+ {
+ return coeff(array<Index, 2>(i0, i1));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+ {
+ return coeff(array<Index, 3>(i0, i1, i2));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ return coeff(array<Index, 4>(i0, i1, i2, i3));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
}
+#endif
- inline const Scalar& operator()(const std::array<Index, NumIndices>& indices) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
eigen_assert(checkIndexRange(indices));
return coeff(indices);
}
- inline const Scalar& operator()(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return coeff(index);
}
- inline const Scalar& operator[](Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
{
- static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead.");
+ // The bracket operator is only for vectors, use the parenthesis operator instead.
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff(index);
}
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
- static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+ {
+ return coeffRef(array<Index, 2>(i0, i1));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+ {
+ return coeffRef(array<Index, 3>(i0, i1, i2));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ return coeffRef(array<Index, 4>(i0, i1, i2, i3));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
}
+#endif
- inline Scalar& operator()(const std::array<Index, NumIndices>& indices)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
eigen_assert(checkIndexRange(indices));
return coeffRef(indices);
}
- inline Scalar& operator()(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
- inline Scalar& operator[](Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
{
- static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead.");
+ // The bracket operator is only for vectors, use the parenthesis operator instead
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(index);
}
- inline Tensor()
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor()
: m_storage()
{
}
- inline Tensor(const Self& other)
- : m_storage(other.m_storage)
- {
- }
-
- inline Tensor(Self&& other)
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const Self& other)
: m_storage(other.m_storage)
{
}
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Tensor(Index firstDimension, IndexTypes... otherDimensions)
- : m_storage()
+ : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
+ {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#else
+ inline explicit Tensor(Index dim1)
+ : m_storage(dim1, array<Index, 1>(dim1))
+ {
+ EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ inline explicit Tensor(Index dim1, Index dim2)
+ : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
{
- static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor.");
- resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+ EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
+ inline explicit Tensor(Index dim1, Index dim2, Index dim3)
+ : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
+ {
+ EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+ : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
+ {
+ EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+ : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 4>(dim1, dim2, dim3, dim4, dim5))
+ {
+ EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#endif
- inline Tensor(std::array<Index, NumIndices> dimensions)
- : m_storage(internal::array_prod(dimensions), dimensions)
+ inline explicit Tensor(const array<Index, NumIndices>& dimensions)
+ : m_storage(internal::array_prod(dimensions), dimensions)
{
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
+ {
+ typedef TensorAssignOp<Tensor, const Tensor> Assign;
+ Assign assign(*this, other);
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
- static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor.");
- resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+ // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
}
+#endif
- void resize(const std::array<Index, NumIndices>& dimensions)
+ EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
{
std::size_t i;
Index size = Index(1);
@@ -282,8 +367,17 @@ class Tensor
#endif
}
+ EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
+ array<Index, NumIndices> dims;
+ for (std::size_t i = 0; i < NumIndices; ++i) {
+ dims[i] = dimensions[i];
+ }
+ resize(dims);
+ }
+
protected:
- bool checkIndexRange(const std::array<Index, NumIndices>& indices) const
+
+ bool checkIndexRange(const array<Index, NumIndices>& indices) const
{
using internal::array_apply_and_reduce;
using internal::array_zip_and_reduce;
@@ -298,16 +392,16 @@ class Tensor
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
}
- inline Index linearizedIndex(const std::array<Index, NumIndices>& indices) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
{
- return internal::tensor_index_linearization_helper<Index, NumIndices, NumIndices - 1, Options&RowMajor>::run(indices, m_storage.dimensions());
+ if (Options&RowMajor) {
+ return m_storage.dimensions().IndexOfRowMajor(indices);
+ } else {
+ return m_storage.dimensions().IndexOfColMajor(indices);
+ }
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 000000000..a4f73b2a1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,164 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+namespace Eigen {
+
+/** \class TensorAssign
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor assignment class.
+ *
+ * This class is represents the assignment of the values resulting from the evaluation of
+ * the rhs expression to the memory locations denoted by the lhs expression.
+ */
+namespace internal {
+template<typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+ typedef typename LhsXprType::Scalar Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename traits<LhsXprType>::StorageKind StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+ static const int Layout = internal::traits<LhsXprType>::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorAssignOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+ typedef typename LhsXprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
+ const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
+};
+
+
+template<typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
+{
+ typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ m_leftImpl(op.lhsExpression(), device),
+ m_rightImpl(op.rhsExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ // The dimensions of the lhs and the rhs tensors should be equal to prevent
+ // overflows and ensure the result is fully initialized.
+ eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_leftImpl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use left impl instead if right impl dimensions are known at compile time.
+ return m_rightImpl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+ // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+ // evaluation isn't supported and the caller still needs to manually assign the values generated
+ // by the rhs to the lhs.
+ return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+ const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+ const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+ m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+ }
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_leftImpl.coeff(index);
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+ {
+ return m_leftImpl.template packet<LoadMode>(index);
+ }
+
+ private:
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+}
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
new file mode 100644
index 000000000..e08ac6aa1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -0,0 +1,573 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
+namespace Eigen {
+
+/** \class TensorBase
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor base class.
+ *
+ * This class is the common parent of the Tensor and TensorMap class, thus
+ * making it possible to use either class interchangably in expressions.
+ */
+
+template<typename Derived>
+class TensorBase<Derived, ReadOnlyAccessors>
+{
+ public:
+ typedef internal::traits<Derived> DerivedTraits;
+ typedef typename DerivedTraits::Scalar Scalar;
+ typedef typename DerivedTraits::Index Index;
+ typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
+ typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
+ static const int NumDimensions = DerivedTraits::NumDimensions;
+
+ // Generic nullary operation support.
+ template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
+ nullaryExpr(const CustomNullaryOp& func) const {
+ return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
+ }
+
+ // Coefficient-wise nullary operators
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
+ constant(const Scalar& value) const {
+ return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
+ random() const {
+ return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
+ }
+ template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
+ random() const {
+ return nullaryExpr(RandomGenerator());
+ }
+
+ // Generic unary operation support.
+ template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
+ unaryExpr(const CustomUnaryOp& func) const {
+ return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+ }
+
+ // Coefficient-wise unary operators
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
+ operator-() const {
+ return unaryExpr(internal::scalar_opposite_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+ sqrt() const {
+ return unaryExpr(internal::scalar_sqrt_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+ square() const {
+ return unaryExpr(internal::scalar_square_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+ cube() const {
+ return unaryExpr(internal::scalar_cube_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+ inverse() const {
+ return unaryExpr(internal::scalar_inverse_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+ exp() const {
+ return unaryExpr(internal::scalar_exp_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+ log() const {
+ return unaryExpr(internal::scalar_log_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+ abs() const {
+ return unaryExpr(internal::scalar_abs_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+ pow(Scalar exponent) const {
+ return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+ operator+ (Scalar rhs) const {
+ return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+ operator- (Scalar rhs) const {
+ EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+ operator* (Scalar rhs) const {
+ return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+ operator/ (Scalar rhs) const {
+ // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ cwiseMax(Scalar threshold) const {
+ return cwiseMax(constant(threshold));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ cwiseMin(Scalar threshold) const {
+ return cwiseMin(constant(threshold));
+ }
+
+ template <typename NewType> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived>
+ cast() const {
+ return unaryExpr(internal::scalar_cast_op<Scalar, NewType>());
+ }
+
+ // Generic binary operation support.
+ template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+ binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
+ return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
+ }
+
+ // Coefficient-wise binary operators.
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
+ operator+(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
+ operator-(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
+ operator*(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+ operator/(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+ cwiseMax(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+ cwiseMin(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+ operator&&(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+ operator||(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
+ }
+
+ // Comparisons and tests.
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived>
+ operator<(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::less<Scalar>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived>
+ operator<=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::less_equal<Scalar>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived>
+ operator>(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::greater<Scalar>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived>
+ operator>=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::greater_equal<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
+ operator==(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::equal_to<Scalar>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
+ operator!=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), std::not_equal_to<Scalar>());
+ }
+
+ // Coefficient-wise ternary operators.
+ template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
+ select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
+ return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
+ }
+
+ // Contractions.
+ typedef Eigen::IndexPair<Index> DimensionPair;
+
+ template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+ contract(const OtherDerived& other, const Dimensions& dims) const {
+ return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+ }
+
+ // Convolutions.
+ template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
+ convolve(const KernelDerived& kernel, const Dimensions& dims) const {
+ return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
+ }
+
+ // Reductions.
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
+ sum(const Dims& dims) const {
+ return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+ sum() const {
+ array<Index, NumDimensions> in_dims;
+ for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
+ return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
+ mean(const Dims& dims) const {
+ return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+ mean() const {
+ array<Index, NumDimensions> in_dims;
+ for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
+ return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
+ prod(const Dims& dims) const {
+ return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+ prod() const {
+ array<Index, NumDimensions> in_dims;
+ for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
+ return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
+ maximum(const Dims& dims) const {
+ return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+ maximum() const {
+ array<Index, NumDimensions> in_dims;
+ for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
+ return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
+ minimum(const Dims& dims) const {
+ return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+ minimum() const {
+ array<Index, NumDimensions> in_dims;
+ for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
+ return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
+ }
+
+ template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<Reducer, const Dims, const Derived>
+ reduce(const Dims& dims, const Reducer& reducer) const {
+ return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
+ }
+
+ template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorBroadcastingOp<const Broadcast, const Derived>
+ broadcast(const Broadcast& broadcast) const {
+ return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
+ }
+
+ template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
+ concatenate(const OtherDerived& other, Axis axis) const {
+ return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
+ }
+
+ template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorPatchOp<const PatchDims, const Derived>
+ extract_patches(const PatchDims& patch_dims) const {
+ return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
+ }
+
+ template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Rows, Cols, const Derived>
+ extract_image_patches() const {
+ return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, PADDING_SAME);
+ }
+
+ template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Rows, Cols, const Derived>
+ extract_image_patches(const PaddingType padding_type) const {
+ return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, padding_type);
+ }
+
+ template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Rows, Cols, const Derived>
+ extract_image_patches(const Index stride, const PaddingType padding_type) const {
+ return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, padding_type);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+ extract_image_patches(const Index patch_rows, const Index patch_cols,
+ const Index row_stride = 1, const Index col_stride = 1) const {
+ return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+ PADDING_SAME);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+ extract_image_patches(const Index patch_rows, const Index patch_cols,
+ const Index row_stride, const Index col_stride,
+ const PaddingType padding_type) const {
+ return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+ padding_type);
+ }
+
+ // Morphing operators.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorLayoutSwapOp<const Derived>
+ swap_layout() const {
+ return TensorLayoutSwapOp<const Derived>(derived());
+ }
+ template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReshapingOp<const NewDimensions, const Derived>
+ reshape(const NewDimensions& newDimensions) const {
+ return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+ }
+ template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+ slice(const StartIndices& startIndices, const Sizes& sizes) const {
+ return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+ }
+ template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<DimId, const Derived>
+ chip(const Index offset) const {
+ return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<Dynamic, const Derived>
+ chip(const Index offset, const Index dim) const {
+ return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+ }
+ template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReverseOp<const ReverseDimensions, const Derived>
+ reverse(const ReverseDimensions& rev) const {
+ return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+ }
+ template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorPaddingOp<const PaddingDimensions, const Derived>
+ pad(const PaddingDimensions& padding) const {
+ return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding);
+ }
+ template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorShufflingOp<const Shuffle, const Derived>
+ shuffle(const Shuffle& shuffle) const {
+ return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+ }
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingOp<const Strides, const Derived>
+ stride(const Strides& strides) const {
+ return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+ }
+
+ // Force the evaluation of the expression.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorForcedEvalOp<const Derived> eval() const {
+ return TensorForcedEvalOp<const Derived>(derived());
+ }
+
+ protected:
+ template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
+ template <typename Scalar, int Options> friend class TensorVarDim;
+ template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+template<typename Derived>
+class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+ public:
+ typedef internal::traits<Derived> DerivedTraits;
+ typedef typename DerivedTraits::Scalar Scalar;
+ typedef typename DerivedTraits::Index Index;
+ typedef Scalar CoeffReturnType;
+ typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
+ static const int NumDimensions = DerivedTraits::NumDimensions;
+
+ template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
+ template <typename Scalar, int Options> friend class TensorVarDim;
+ template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setZero() {
+ return setConstant(Scalar(0));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
+ return derived() = this->constant(val);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setRandom() {
+ return derived() = this->random();
+ }
+ template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setRandom() {
+ return derived() = this->template random<RandomGenerator>();
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setValues(
+ const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
+ TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
+ internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
+ return derived();
+ }
+#endif // EIGEN_HAS_VARIADIC_TEMPLATES
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator+=(const OtherDerived& other) {
+ return derived() = derived() + other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator-=(const OtherDerived& other) {
+ return derived() = derived() - other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator*=(const OtherDerived& other) {
+ return derived() = derived() * other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator/=(const OtherDerived& other) {
+ return derived() = derived() / other.derived();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorLayoutSwapOp<Derived>
+ swap_layout() const {
+ return TensorLayoutSwapOp<Derived>(derived());
+ }
+ template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReshapingOp<const NewDimensions, Derived>
+ reshape(const NewDimensions& newDimensions) const {
+ return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
+ }
+ template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorSlicingOp<const StartIndices, const Sizes, Derived>
+ slice(const StartIndices& startIndices, const Sizes& sizes) const {
+ return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
+ }
+ template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorChippingOp<DimId, Derived>
+ chip(const Index offset) const {
+ return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorChippingOp<Dynamic, Derived>
+ chip(const Index offset, const Index dim) const {
+ return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
+ }
+ template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorShufflingOp<const Shuffle, Derived>
+ shuffle(const Shuffle& shuffle) const {
+ return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
+ }
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorStridingOp<const Strides, Derived>
+ stride(const Strides& strides) const {
+ return TensorStridingOp<const Strides, Derived>(derived(), strides);
+ }
+
+ // Select the device on which to evaluate the expression.
+ template <typename DeviceType>
+ TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
+ return TensorDevice<Derived, DeviceType>(device, derived());
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
new file mode 100644
index 000000000..5790e19d6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -0,0 +1,341 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+
+namespace Eigen {
+
+/** \class TensorBroadcasting
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor broadcasting class.
+ *
+ *
+ */
+namespace internal {
+template<typename Broadcast, typename XprType>
+struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Broadcast, typename XprType>
+struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
+{
+ typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+};
+
+template<typename Broadcast, typename XprType>
+struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
+{
+ typedef TensorBroadcastingOp<Broadcast, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Broadcast, typename XprType>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
+ : m_xpr(expr), m_broadcast(broadcast) {}
+
+ EIGEN_DEVICE_FUNC
+ const Broadcast& broadcast() const { return m_broadcast; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Broadcast m_broadcast;
+};
+
+
+// Eval as rvalue
+template<typename Broadcast, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
+{
+ typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const Broadcast& broadcast = op.broadcast();
+ for (int i = 0; i < NumDims; ++i) {
+ eigen_assert(input_dims[i] > 0);
+ m_dimensions[i] = input_dims[i] * broadcast[i];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ } else {
+ m_inputStrides[NumDims-1] = 1;
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims-2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ }
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return coeffColMajor(index);
+ } else {
+ return coeffRowMajor(index);
+ }
+ }
+
+ // TODO: attempt to speed this up. The integer divisions and modulo are slow
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+ {
+ Index inputIndex = 0;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ if (internal::index_statically_eq<Broadcast>()(0, 1)) {
+ eigen_assert(index < m_impl.dimensions()[0]);
+ inputIndex += index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
+ eigen_assert(index % m_impl.dimensions()[0] == 0);
+ } else {
+ inputIndex += (index % m_impl.dimensions()[0]);
+ }
+ }
+ return m_impl.coeff(inputIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+ {
+ Index inputIndex = 0;
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
+ eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+ inputIndex += index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
+ eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+ } else {
+ inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+ }
+ }
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
+ {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return packetColMajor<LoadMode>(index);
+ } else {
+ return packetRowMajor<LoadMode>(index);
+ }
+ }
+
+ // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+ // the alignment at compile time.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ const Index originalIndex = index;
+
+ Index inputIndex = 0;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ Index innermostLoc;
+ if (internal::index_statically_eq<Broadcast>()(0, 1)) {
+ eigen_assert(index < m_impl.dimensions()[0]);
+ innermostLoc = index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
+ eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0);
+ innermostLoc = 0;
+ } else {
+ innermostLoc = index % m_impl.dimensions()[0];
+ }
+ }
+ inputIndex += innermostLoc;
+
+ // Todo: this could be extended to the second dimension if we're not
+ // broadcasting alongside the first dimension, and so on.
+ if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ values[0] = m_impl.coeff(inputIndex);
+ for (int i = 1; i < packetSize; ++i) {
+ values[i] = coeffColMajor(originalIndex+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ const Index originalIndex = index;
+
+ Index inputIndex = 0;
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ Index innermostLoc;
+ if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
+ eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+ innermostLoc = index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
+ eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0);
+ innermostLoc = 0;
+ } else {
+ innermostLoc = index % m_impl.dimensions()[NumDims-1];
+ }
+ }
+ inputIndex += innermostLoc;
+
+ // Todo: this could be extended to the second dimension if we're not
+ // broadcasting alongside the first dimension, and so on.
+ if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ values[0] = m_impl.coeff(inputIndex);
+ for (int i = 1; i < packetSize; ++i) {
+ values[i] = coeffRowMajor(originalIndex+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 000000000..dc9586cbc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,363 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+
+namespace Eigen {
+
+/** \class TensorKChippingReshaping
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
+ *
+ *
+ */
+
+namespace internal {
+template<DenseIndex DimId, typename XprType>
+struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - 1;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
+{
+ typedef const TensorChippingOp<DimId, XprType>& type;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
+{
+ typedef TensorChippingOp<DimId, XprType> type;
+};
+
+template <DenseIndex DimId>
+struct DimensionId
+{
+ DimensionId(DenseIndex dim) {
+ eigen_assert(dim == DimId);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+ return DimId;
+ }
+};
+template <>
+struct DimensionId<Dynamic>
+{
+ DimensionId(DenseIndex dim) : actual_dim(dim) {
+ eigen_assert(dim >= 0);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+ return actual_dim;
+ }
+ private:
+ const DenseIndex actual_dim;
+};
+
+
+} // end namespace internal
+
+
+
+template<DenseIndex DimId, typename XprType>
+class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+ : m_xpr(expr), m_offset(offset), m_dim(dim) {
+ }
+
+ EIGEN_DEVICE_FUNC
+ const Index offset() const { return m_offset; }
+ EIGEN_DEVICE_FUNC
+ const Index dim() const { return m_dim.actualDim(); }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
+ {
+ typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
+ Assign assign(*this, other);
+ static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
+ internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
+ internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Index m_offset;
+ const internal::DimensionId<DimId> m_dim;
+};
+
+
+// Eval as rvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+ typedef TensorChippingOp<DimId, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims-1;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets.
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
+ {
+ // We could also support the case where NumInputDims==1 if needed.
+ EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(NumInputDims > m_dim.actualDim());
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ int j = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (i != m_dim.actualDim()) {
+ m_dimensions[j] = input_dims[i];
+ ++j;
+ }
+ }
+
+ m_stride = 1;
+ m_inputStride = 1;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < m_dim.actualDim(); ++i) {
+ m_stride *= input_dims[i];
+ m_inputStride *= input_dims[i];
+ }
+ } else {
+ for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
+ m_stride *= input_dims[i];
+ m_inputStride *= input_dims[i];
+ }
+ }
+ m_inputStride *= input_dims[m_dim.actualDim()];
+ m_inputOffset = m_stride * op.offset();
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(m_stride == 1);
+ Index inputIndex = index * m_inputStride + m_inputOffset;
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = m_impl.coeff(inputIndex);
+ inputIndex += m_inputStride;
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ // m_stride is aways greater than index, so let's avoid the integer division.
+ eigen_assert(m_stride > index);
+ return m_impl.template packet<LoadMode>(index + m_inputOffset);
+ } else {
+ const Index idx = index / m_stride;
+ const Index rem = index - idx * m_stride;
+ if (rem + packetSize <= m_stride) {
+ Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
+ return m_impl.template packet<LoadMode>(inputIndex);
+ } else {
+ // Cross the stride boundary. Fallback to slow path.
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index);
+ ++index;
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+ Scalar* result = m_impl.data();
+ if (m_dim.actualDim() == NumDims && result) {
+ return result + m_inputOffset;
+ } else {
+ return NULL;
+ }
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex;
+ if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(m_stride == 1);
+ inputIndex = index * m_inputStride + m_inputOffset;
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ // m_stride is aways greater than index, so let's avoid the integer division.
+ eigen_assert(m_stride > index);
+ inputIndex = index + m_inputOffset;
+ } else {
+ const Index idx = index / m_stride;
+ inputIndex = idx * m_inputStride + m_inputOffset;
+ index -= idx * m_stride;
+ inputIndex += index;
+ }
+ return inputIndex;
+ }
+
+ Dimensions m_dimensions;
+ Index m_stride;
+ Index m_inputOffset;
+ Index m_inputStride;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const internal::DimensionId<DimId> m_dim;
+ const Device& m_device;
+};
+
+
+// Eval as lvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
+ : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
+ typedef TensorChippingOp<DimId, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims-1;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(this->m_stride == 1);
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+ for (int i = 0; i < packetSize; ++i) {
+ this->m_impl.coeffRef(inputIndex) = values[i];
+ inputIndex += this->m_inputStride;
+ }
+ } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+ // m_stride is aways greater than index, so let's avoid the integer division.
+ eigen_assert(this->m_stride > index);
+ this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
+ } else {
+ const Index idx = index / this->m_stride;
+ const Index rem = index - idx * this->m_stride;
+ if (rem + packetSize <= this->m_stride) {
+ const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
+ this->m_impl.template writePacket<StoreMode>(inputIndex, x);
+ } else {
+ // Cross stride boundary. Fallback to slow path.
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ for (int i = 0; i < packetSize; ++i) {
+ this->coeffRef(index) = values[i];
+ ++index;
+ }
+ }
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
new file mode 100644
index 000000000..a1dec76d1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+
+namespace Eigen {
+
+/** \class TensorConcatenationOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor concatenation class.
+ *
+ *
+ */
+namespace internal {
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions;
+ static const int Layout = traits<LhsXprType>::Layout;
+ enum { Flags = 0 };
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
+{
+ public:
+ typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
+ typedef typename internal::traits<TensorConcatenationOp>::Packet Packet;
+ typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorConcatenationOp>::Index Index;
+ typedef typename internal::nested<TensorConcatenationOp>::type Nested;
+ typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
+ typename RhsXprType::PacketReturnType>::ret PacketReturnType;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const Axis m_axis;
+};
+
+
+// Eval as rvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+ typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+ static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(0 <= m_axis && m_axis < NumDims);
+ const Dimensions& lhs_dims = m_leftImpl.dimensions();
+ const Dimensions& rhs_dims = m_rightImpl.dimensions();
+ int i = 0;
+ for (; i < m_axis; ++i) {
+ eigen_assert(lhs_dims[i] > 0);
+ eigen_assert(lhs_dims[i] == rhs_dims[i]);
+ m_dimensions[i] = lhs_dims[i];
+ }
+ eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
+ eigen_assert(rhs_dims[i] > 0);
+ m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+ for (++i; i < NumDims; ++i) {
+ eigen_assert(lhs_dims[i] > 0);
+ eigen_assert(lhs_dims[i] == rhs_dims[i]);
+ m_dimensions[i] = lhs_dims[i];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_leftStrides[0] = 1;
+ m_rightStrides[0] = 1;
+ m_outputStrides[0] = 1;
+
+ for (int i = 1; i < NumDims; ++i) {
+ m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1];
+ m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1];
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ } else {
+ m_leftStrides[NumDims - 1] = 1;
+ m_rightStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims - 1] = 1;
+
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1];
+ m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1];
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+ {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+ {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
+ // See CL/76180724 comments for more ideas.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Collect dimension-wise indices (subs).
+ array<Index, NumDims> subs;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ subs[i] = index / m_outputStrides[i];
+ index -= subs[i] * m_outputStrides[i];
+ }
+ subs[0] = index;
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ subs[i] = index / m_outputStrides[i];
+ index -= subs[i] * m_outputStrides[i];
+ }
+ subs[NumDims - 1] = index;
+ }
+
+ const Dimensions& left_dims = m_leftImpl.dimensions();
+ if (subs[m_axis] < left_dims[m_axis]) {
+ Index left_index;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ left_index = subs[0];
+ for (int i = 1; i < NumDims; ++i) {
+ left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+ }
+ } else {
+ left_index = subs[NumDims - 1];
+ for (int i = NumDims - 2; i >= 0; --i) {
+ left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+ }
+ }
+ return m_leftImpl.coeff(left_index);
+ } else {
+ subs[m_axis] -= left_dims[m_axis];
+ const Dimensions& right_dims = m_rightImpl.dimensions();
+ Index right_index;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ right_index = subs[0];
+ for (int i = 1; i < NumDims; ++i) {
+ right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+ }
+ } else {
+ right_index = subs[NumDims - 1];
+ for (int i = NumDims - 2; i >= 0; --i) {
+ right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+ }
+ }
+ return m_rightImpl.coeff(right_index);
+ }
+ }
+
+ // TODO(phli): Add a real vectorization.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_leftStrides;
+ array<Index, NumDims> m_rightStrides;
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+ const Axis m_axis;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
new file mode 100644
index 000000000..f7254a24d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -0,0 +1,992 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+
+namespace Eigen {
+
+/** \class TensorContraction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor contraction class.
+ *
+ *
+ */
+namespace internal {
+
+enum {
+ Rhs = 0,
+ Lhs = 1,
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ size_t packet_size, bool inner_dim_contiguous>
+class BaseTensorContractionMapper {
+ public:
+ EIGEN_DEVICE_FUNC
+ BaseTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ m_tensor(tensor),
+ m_nocontract_strides(nocontract_strides),
+ m_ij_strides(ij_strides),
+ m_contract_strides(contract_strides),
+ m_k_strides(k_strides) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+ // column major assumption
+ return operator()(row, 0);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+ return m_tensor.coeff(computeIndex(row, col));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+ const bool left = (side == Lhs);
+ Index nocontract_val = left ? row : col;
+ Index linidx = 0;
+ for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) {
+ const Index idx = nocontract_val / m_ij_strides[i];
+ linidx += idx * m_nocontract_strides[i];
+ nocontract_val -= idx * m_ij_strides[i];
+ }
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx += nocontract_val;
+ } else {
+ linidx += nocontract_val * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val = left ? col : row;
+ for (int i = array_size<contract_t>::value - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+ EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
+
+ return linidx;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+ const bool left = (side == Lhs);
+ Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+ Index linidx[2] = {0, 0};
+ for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx[0] += nocontract_val[0];
+ linidx[1] += nocontract_val[1];
+ } else {
+ linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+ linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+ for (int i = array_size<contract_t>::value - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
+ EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
+ return IndexPair<Index>(linidx[0], linidx[1]);
+ }
+
+ Index firstAligned(Index size) const {
+ return size;
+ }
+ Index stride() const {
+ return 1;
+ }
+
+ protected:
+ const Tensor m_tensor;
+ const nocontract_t m_nocontract_strides;
+ const nocontract_t m_ij_strides;
+ const contract_t m_contract_strides;
+ const contract_t m_k_strides;
+};
+
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ size_t packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper;
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ size_t packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper {
+ public:
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+ typedef Self LinearMapper;
+
+ EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+ : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+ return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+ return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+ return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+ return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+ return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+ m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+ return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT, int AlignmentType>
+ EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+ EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return loadPacket(i);
+ }
+
+ template <typename Packet>
+ bool aligned(Index /*i*/) const {
+ return false;
+ }
+
+ private:
+ const ParentMapper& m_base_mapper;
+ const Index m_vert_offset;
+ const Index m_horiz_offset;
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ size_t packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1),
+ bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned>
+class TensorContractionInputMapper
+ : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> {
+
+ public:
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+ typedef SubMapper VectorMapper;
+
+ TensorContractionInputMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides)
+ : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+ return SubMapper(*this, i, j);
+ }
+
+ EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+ return VectorMapper(*this, i, j);
+ }
+
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+ // whole method makes column major assumption
+
+ // don't need to add offsets for now (because operator handles that)
+ // current code assumes packet size must be a multiple of 2
+ EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+ const Index index = this->computeIndex(i, j);
+ eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+ return this->m_tensor.template packet<Alignment>(index);
+ }
+
+ const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+ const Index first = indexPair.first;
+ const Index last = indexPair.second;
+
+ // We can always do optimized packet reads from left hand side right now, because
+ // the vertical matrix dimension on the left hand side is never contracting.
+ // On the right hand side we need to check if the contracting dimensions may have
+ // been shuffled first.
+ if (Tensor::PacketAccess &&
+ (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+ (last - first) == (packet_size - 1)) {
+
+ return this->m_tensor.template packet<Alignment>(first);
+ }
+
+ EIGEN_ALIGN_DEFAULT Scalar data[packet_size];
+
+ data[0] = this->m_tensor.coeff(first);
+ for (Index k = 1; k < packet_size - 1; k += 2) {
+ const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+ data[k] = this->m_tensor.coeff(internal_pair.first);
+ data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+ }
+ data[packet_size - 1] = this->m_tensor.coeff(last);
+
+ return pload<Packet>(data);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+ // whole method makes column major assumption
+
+ // don't need to add offsets for now (because operator handles that)
+ const Index half_packet_size = unpacket_traits<HalfPacket>::size;
+ if (half_packet_size == packet_size) {
+ return loadPacket(i, j);
+ }
+ EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size];
+ for (Index k = 0; k < half_packet_size; k++) {
+ data[k] = operator()(i + k, j);
+ }
+ return pload<HalfPacket>(data);
+ }
+};
+
+
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> {
+
+ public:
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+ typedef SubMapper VectorMapper;
+
+ TensorContractionInputMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides)
+ : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+ return SubMapper(*this, i, j);
+ }
+
+ EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+ return VectorMapper(*this, i, j);
+ }
+
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+ EIGEN_ALIGN_DEFAULT Scalar data[1];
+ data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+ return pload<typename packet_traits<Scalar>::type>(data);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
+ return loadPacket(i, j);
+ }
+};
+
+
+template <size_t n> struct max_n_1 {
+ static const size_t size = n;
+};
+template <> struct max_n_1<0> {
+ static const size_t size = 1;
+};
+
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+
+ // From NumDims below.
+ static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
+ static const int Layout = traits<LhsXprType>::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+};
+
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+ typedef Indices_ Indices;
+ typedef LeftArgType_ LeftArgType;
+ typedef RightArgType_ RightArgType;
+ typedef Device_ Device;
+
+ // From NumDims below.
+ static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
+};
+
+} // end namespace internal
+
+template<typename Indices, typename LhsXprType, typename RhsXprType>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorContractionOp>::Packet Packet;
+ typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
+ typename RhsXprType::PacketReturnType>::ret PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
+ const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const Indices m_indices;
+};
+
+
+template<bool cond> struct Cond {};
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T1& choose(Cond<true>, const T1& first, const T2&) {
+ return first;
+}
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T2& choose(Cond<false>, const T1&, const T2& second) {
+ return second;
+}
+
+
+template<typename Derived>
+struct TensorContractionEvaluatorBase
+{
+ typedef typename internal::traits<Derived>::Indices Indices;
+ typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
+ typedef typename internal::traits<Derived>::RightArgType RightArgType;
+ typedef typename internal::traits<Derived>::Device Device;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Packet Packet;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+ static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+ typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorContractionEvaluatorBase(const XprType& op, const Device& device)
+ : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+ op.lhsExpression(), op.rhsExpression()), device),
+ m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+ op.rhsExpression(), op.lhsExpression()), device),
+ m_device(device),
+ m_result(NULL) {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ eigen_assert((internal::array_size<contract_t>::value > 0) && "Must contract on some indices");
+
+
+ DSizes<Index, LDims> eval_left_dims;
+ DSizes<Index, RDims> eval_right_dims;
+ array<IndexPair<Index>, ContractDims> eval_op_indices;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ // For ColMajor, we keep using the existing dimensions
+ for (int i = 0; i < LDims; i++) {
+ eval_left_dims[i] = m_leftImpl.dimensions()[i];
+ }
+ for (int i = 0; i < RDims; i++) {
+ eval_right_dims[i] = m_rightImpl.dimensions()[i];
+ }
+ // We keep the pairs of contracting indices.
+ for (int i = 0; i < ContractDims; i++) {
+ eval_op_indices[i].first = op.indices()[i].first;
+ eval_op_indices[i].second = op.indices()[i].second;
+ }
+ } else {
+ // For RowMajor, we need to reverse the existing dimensions
+ for (int i = 0; i < LDims; i++) {
+ eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
+ }
+ for (int i = 0; i < RDims; i++) {
+ eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
+ }
+ // We need to flip all the pairs of contracting indices as well as
+ // reversing the dimensions.
+ for (int i = 0; i < ContractDims; i++) {
+ eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
+ eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
+ }
+ }
+
+ array<Index, LDims> lhs_strides;
+ lhs_strides[0] = 1;
+ for (int i = 0; i < LDims-1; ++i) {
+ lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
+ }
+
+ array<Index, RDims> rhs_strides;
+ rhs_strides[0] = 1;
+ for (int i = 0; i < RDims-1; ++i) {
+ rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
+ }
+
+ m_i_strides[0] = 1;
+ m_j_strides[0] = 1;
+ m_k_strides[0] = 1;
+
+ m_i_size = 1;
+ m_j_size = 1;
+ m_k_size = 1;
+
+ // To compute the dimension, we simply concatenate the non-contracting
+ // dimensions of the left and then the right tensor. Additionally, we also
+ // compute the strides corresponding to the left non-contracting
+ // dimensions and right non-contracting dimensions.
+ m_lhs_inner_dim_contiguous = true;
+ int dim_idx = 0;
+ int nocontract_idx = 0;
+
+ for (int i = 0; i < LDims; i++) {
+ // find if we are contracting on index i of left tensor
+ bool contracting = false;
+ for (int j = 0; j < ContractDims; j++) {
+ if (eval_op_indices[j].first == i) {
+ contracting = true;
+ break;
+ }
+ }
+ if (!contracting) {
+ // add dimension size to output dimensions
+ m_dimensions[dim_idx] = eval_left_dims[i];
+ m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
+ if (dim_idx != i) {
+ m_lhs_inner_dim_contiguous = false;
+ }
+ if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
+ m_i_strides[nocontract_idx+1] =
+ m_i_strides[nocontract_idx] * eval_left_dims[i];
+ } else {
+ m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
+ }
+ dim_idx++;
+ nocontract_idx++;
+ }
+ }
+
+ nocontract_idx = 0;
+ for (int i = 0; i < RDims; i++) {
+ bool contracting = false;
+ // find if we are contracting on index i of right tensor
+ for (int j = 0; j < ContractDims; j++) {
+ if (eval_op_indices[j].second == i) {
+ contracting = true;
+ break;
+ }
+ }
+ if (!contracting) {
+ m_dimensions[dim_idx] = eval_right_dims[i];
+ if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
+ m_j_strides[nocontract_idx+1] =
+ m_j_strides[nocontract_idx] * eval_right_dims[i];
+ } else {
+ m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
+ }
+ m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
+ dim_idx++;
+ nocontract_idx++;
+ }
+ }
+
+ // Now compute the strides corresponding to the contracting dimensions. We
+ // assumed above that non-contracting axes are represented in the same order
+ // in the matrix as they are in the tensor. This is not the case for
+ // contracting axes. As the contracting axes must be of the same size in
+ // each tensor, we'll only look at the first tensor here.
+ m_rhs_inner_dim_contiguous = true;
+ m_rhs_inner_dim_reordered = false;
+ for (int i = 0; i < ContractDims; i++) {
+ Index left = eval_op_indices[i].first;
+ Index right = eval_op_indices[i].second;
+
+ Index size = eval_left_dims[left];
+ eigen_assert(size == eval_right_dims[right] &&
+ "Contraction axes must be same size");
+
+ if (i+1 < internal::array_size<contract_t>::value) {
+ m_k_strides[i+1] = m_k_strides[i] * size;
+ } else {
+ m_k_size = m_k_strides[i] * size;
+ }
+ m_left_contracting_strides[i] = lhs_strides[left];
+ m_right_contracting_strides[i] = rhs_strides[right];
+
+ if (i > 0 && right < eval_op_indices[i-1].second) {
+ m_rhs_inner_dim_reordered = true;
+ }
+ if (right != i) {
+ m_rhs_inner_dim_contiguous = false;
+ }
+ }
+
+ // Scalar case. We represent the result as a 1d tensor of size 1.
+ if (LDims + RDims == 2 * ContractDims) {
+ m_dimensions[0] = 1;
+ }
+
+ // If the layout is RowMajor, we need to reverse the m_dimensions
+ if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+ for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+ std::swap(m_dimensions[i], m_dimensions[j]);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(m_result);
+ return true;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalGemv(Scalar* buffer) const {
+ const Index rows = m_i_size;
+ const Index cols = m_k_size;
+
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+ const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
+ const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
+ m_left_contracting_strides, m_k_strides);
+ RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
+ m_right_contracting_strides, m_k_strides);
+
+ const Scalar alpha(1);
+ const Index resIncr(1);
+
+ // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
+ m_device.memset(buffer, 0, rows * sizeof(Scalar));
+
+ internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
+ rows, cols, lhs, rhs,
+ buffer, resIncr, alpha);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+
+ if (m_result != NULL) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_result[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<Packet, LoadMode>(m_result + index);
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ // Prevent assignment
+ TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+ Dimensions m_dimensions;
+
+ contract_t m_k_strides;
+ contract_t m_left_contracting_strides;
+ contract_t m_right_contracting_strides;
+
+ bool m_lhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_reordered;
+
+ left_nocontract_t m_i_strides;
+ right_nocontract_t m_j_strides;
+ left_nocontract_t m_left_nocontract_strides;
+ right_nocontract_t m_right_nocontract_strides;
+
+ Index m_i_size;
+ Index m_j_size;
+ Index m_k_size;
+
+ TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
+ TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
+ const Device& m_device;
+ Scalar* m_result;
+};
+
+
+// evaluator for default device
+template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+ public TensorContractionEvaluatorBase<
+ TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Packet Packet;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+ typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+
+ static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
+
+ // Could we use NumDimensions here?
+ typedef DSizes<Index, NumDims> Dimensions;
+
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) { }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ if (this->m_j_size == 1) {
+ this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+ return;
+ }
+
+ evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ // define mr, nr, and all of my data mapper types
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ const Index nr = Traits::nr;
+ const Index mr = Traits::mr;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
+ const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ // Declare GEBP packing and kernel structs
+ internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+ internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+ internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
+
+ // Sizes of the blocks to load in cache. See the Goto paper for details.
+ BlockingType blocking(m, n, k, 1, true);
+ const Index kc = blocking.kc();
+ const Index mc = (std::min)(m, blocking.mc());
+ const Index nc = (std::min)(n, blocking.nc());
+ const Index sizeA = mc * kc;
+ const Index sizeB = kc * nc;
+
+ LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+ RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+
+ for(Index i2=0; i2<m; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,m)-i2;
+ for (Index k2 = 0; k2 < k; k2 += kc) {
+ // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+ const Index actual_kc = (std::min)(k2 + kc, k) - k2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+
+ // series of horizontal blocks
+ for (Index j2 = 0; j2 < n; j2 += nc) {
+ // make sure we don't overshoot right edge of right matrix, then pack block
+ const Index actual_nc = (std::min)(j2 + nc, n) - j2;
+ pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+
+ // call gebp (matrix kernel)
+ // The parameters here are copied from Eigen's GEMM implementation
+ gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+ }
+ }
+ }
+
+ this->m_device.deallocate(blockA);
+ this->m_device.deallocate(blockB);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
new file mode 100644
index 000000000..588770bb4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -0,0 +1,1384 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
+ const Index m_size, const Index n_size, const Index k_size) {
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ // declare and initialize 64 registers for output 8x8 block
+
+ // prefetch registers
+ Scalar lhs_pf0;
+ Scalar lhs_pf1;
+ Scalar lhs_pf2;
+ Scalar lhs_pf3;
+ Scalar lhs_pf4;
+ Scalar lhs_pf5;
+ Scalar lhs_pf6;
+ Scalar lhs_pf7;
+
+ Scalar rhs_pf0;
+ Scalar rhs_pf1;
+ Scalar rhs_pf2;
+ Scalar rhs_pf3;
+ Scalar rhs_pf4;
+ Scalar rhs_pf5;
+ Scalar rhs_pf6;
+ Scalar rhs_pf7;
+
+ // shared memory is formatted
+ // (contract idx in block, nocontract idx in block, block idx)
+ // where block idx is column major. This transposition limits the number of
+ // bank conflicts when reading the LHS. The core idea is that since the contracting
+ // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+ // On the LHS, we pad each row inside of each block with an extra element. This makes
+ // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+ // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+ // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+ // conflicts on writes and also none on reads.
+
+ // storage indices
+ const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+ const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+ const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+ const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+ const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+ const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+ const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+ const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+ const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+ const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+ const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+ const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+ const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+ const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+ const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+ const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+ const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+ const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+ // in the loading code, the following variables are important:
+ // threadIdx.x: the vertical position in an 8x8 block
+ // threadIdx.y: the vertical index of the 8x8 block in the grid
+ // threadIdx.z: the horizontal position in an 8x8 block
+ // k: the horizontal index of the 8x8 block in the grid
+ //
+ // The k parameter is implicit (it was the loop counter for a loop that went
+ // from 0 to <8, but now that loop is unrolled in the below code.
+
+ const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+ const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k) \
+ { \
+ lhs_pf0 = Scalar(0); \
+ lhs_pf1 = Scalar(0); \
+ lhs_pf2 = Scalar(0); \
+ lhs_pf3 = Scalar(0); \
+ lhs_pf4 = Scalar(0); \
+ lhs_pf5 = Scalar(0); \
+ lhs_pf6 = Scalar(0); \
+ lhs_pf7 = Scalar(0); \
+ \
+ rhs_pf0 = Scalar(0); \
+ rhs_pf1 = Scalar(0); \
+ rhs_pf2 = Scalar(0); \
+ rhs_pf3 = Scalar(0); \
+ rhs_pf4 = Scalar(0); \
+ rhs_pf5 = Scalar(0); \
+ rhs_pf6 = Scalar(0); \
+ rhs_pf7 = Scalar(0); \
+ \
+ if (!needs_edge_check || lhs_vert < m_size) { \
+ const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
+ const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
+ const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
+ const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
+ const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
+ const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
+ const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
+ const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
+ \
+ if (!needs_edge_check || lhs_horiz_7 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
+ } else if (lhs_horiz_6 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ } else if (lhs_horiz_5 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ } else if (lhs_horiz_4 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ } else if (lhs_horiz_3 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ } else if (lhs_horiz_2 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ } else if (lhs_horiz_1 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ } else if (lhs_horiz_0 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ } \
+ } \
+ \
+ const Index rhs_vert = base_k + load_idx_vert; \
+ if (!needs_edge_check || rhs_vert < k_size) { \
+ const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
+ const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
+ const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
+ const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
+ const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
+ const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
+ const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
+ const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
+ \
+ if (rhs_horiz_7 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
+ } else if (rhs_horiz_6 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ } else if (rhs_horiz_5 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ } else if (rhs_horiz_4 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ } else if (rhs_horiz_3 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ } else if (rhs_horiz_2 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ } else if (rhs_horiz_1 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ } else if (rhs_horiz_0 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ } \
+ } \
+ } \
+
+#define writeRegToShmem(_) \
+ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+ \
+ lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+ rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+ \
+ lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+ rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+ \
+ lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+ rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+ \
+ lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+ rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+ \
+ lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+ rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+ \
+ lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+ rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+ \
+ lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+ rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
+
+ // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i) \
+ Scalar res(i, 0) = Scalar(0); \
+ Scalar res(i, 1) = Scalar(0); \
+ Scalar res(i, 2) = Scalar(0); \
+ Scalar res(i, 3) = Scalar(0); \
+ Scalar res(i, 4) = Scalar(0); \
+ Scalar res(i, 5) = Scalar(0); \
+ Scalar res(i, 6) = Scalar(0); \
+ Scalar res(i, 7) = Scalar(0); \
+
+ initResultRow(0);
+ initResultRow(1);
+ initResultRow(2);
+ initResultRow(3);
+ initResultRow(4);
+ initResultRow(5);
+ initResultRow(6);
+ initResultRow(7);
+#undef initResultRow
+
+ for (Index base_k = 0; base_k < k_size; base_k += 64) {
+ // wait for previous iteration to finish with shmem. Despite common sense,
+ // the code is a bit faster with this here then at bottom of loop
+ __syncthreads();
+
+ prefetchIntoRegisters(base_k);
+ writeRegToShmem();
+
+ #undef prefetchIntoRegisters
+ #undef writeRegToShmem
+
+ // wait for shared mem packing to be done before starting computation
+ __syncthreads();
+
+ // compute 8x8 matrix product by outer product. This involves packing one column
+ // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+ Scalar lcol(0);
+ Scalar lcol(1);
+ Scalar lcol(2);
+ Scalar lcol(3);
+ Scalar lcol(4);
+ Scalar lcol(5);
+ Scalar lcol(6);
+ Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+ Scalar rrow(0);
+ Scalar rrow(1);
+ Scalar rrow(2);
+ Scalar rrow(3);
+ Scalar rrow(4);
+ Scalar rrow(5);
+ Scalar rrow(6);
+ Scalar rrow(7);
+
+ // Now x corresponds to k, y to m, and z to n
+ const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+ const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+ lcol(0) = lhs_element(0, j); \
+ rrow(0) = rhs_element(i, 0); \
+ lcol(1) = lhs_element(1, j); \
+ rrow(1) = rhs_element(i, 1); \
+ lcol(2) = lhs_element(2, j); \
+ rrow(2) = rhs_element(i, 2); \
+ lcol(3) = lhs_element(3, j); \
+ rrow(3) = rhs_element(i, 3); \
+ lcol(4) = lhs_element(4, j); \
+ rrow(4) = rhs_element(i, 4); \
+ lcol(5) = lhs_element(5, j); \
+ rrow(5) = rhs_element(i, 5); \
+ lcol(6) = lhs_element(6, j); \
+ rrow(6) = rhs_element(i, 6); \
+ lcol(7) = lhs_element(7, j); \
+ rrow(7) = rhs_element(i, 7); \
+
+#define computeCol(j) \
+ res(0, j) += lcol(0) * rrow(j); \
+ res(1, j) += lcol(1) * rrow(j); \
+ res(2, j) += lcol(2) * rrow(j); \
+ res(3, j) += lcol(3) * rrow(j); \
+ res(4, j) += lcol(4) * rrow(j); \
+ res(5, j) += lcol(5) * rrow(j); \
+ res(6, j) += lcol(6) * rrow(j); \
+ res(7, j) += lcol(7) * rrow(j); \
+
+#define computePass(i) \
+ loadData(i, i); \
+ \
+ computeCol(0); \
+ computeCol(1); \
+ computeCol(2); \
+ computeCol(3); \
+ computeCol(4); \
+ computeCol(5); \
+ computeCol(6); \
+ computeCol(7); \
+
+ computePass(0);
+ computePass(1);
+ computePass(2);
+ computePass(3);
+ computePass(4);
+ computePass(5);
+ computePass(6);
+ computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+ } // end loop over k
+
+ // we've now iterated over all of the large (ie width 64) k blocks and
+ // accumulated results in registers. At this point thread (x, y, z) contains
+ // the sum across all big k blocks of the product of little k block of index (x, y)
+ // with block of index (y, z). To compute the final output, we need to reduce
+ // the 8 threads over y by summation.
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+
+#define reduceRow(i, mask) \
+ shuffleInc(i, 0, mask); \
+ shuffleInc(i, 1, mask); \
+ shuffleInc(i, 2, mask); \
+ shuffleInc(i, 3, mask); \
+ shuffleInc(i, 4, mask); \
+ shuffleInc(i, 5, mask); \
+ shuffleInc(i, 6, mask); \
+ shuffleInc(i, 7, mask); \
+
+#define reduceMatrix(mask) \
+ reduceRow(0, mask); \
+ reduceRow(1, mask); \
+ reduceRow(2, mask); \
+ reduceRow(3, mask); \
+ reduceRow(4, mask); \
+ reduceRow(5, mask); \
+ reduceRow(6, mask); \
+ reduceRow(7, mask); \
+
+ // actually perform the reduction, now each thread of index (_, y, z)
+ // contains the correct values in its registers that belong in the output
+ // block
+ reduceMatrix(1);
+ reduceMatrix(2);
+ reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+ // now we need to copy the 64 values into main memory. We can't split work
+ // among threads because all variables are in registers. There's 2 ways
+ // to do this:
+ // (1) have 1 thread do 64 writes from registers into global memory
+ // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+ // each do 8 writes into global memory. We can just overwrite the shared
+ // memory from the problem we just solved.
+ // (2) is slightly faster than (1) due to less branching and more ILP
+
+ // TODO: won't yield much gain, but could just use currently unused shared mem
+ // and then we won't have to sync
+ // wait for shared mem to be out of use
+ __syncthreads();
+
+#define writeResultShmem(i, j) \
+ lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i) \
+ writeResultShmem(i, 0); \
+ writeResultShmem(i, 1); \
+ writeResultShmem(i, 2); \
+ writeResultShmem(i, 3); \
+ writeResultShmem(i, 4); \
+ writeResultShmem(i, 5); \
+ writeResultShmem(i, 6); \
+ writeResultShmem(i, 7); \
+
+ if (threadIdx.x == 0) {
+ writeRow(0);
+ writeRow(1);
+ writeRow(2);
+ writeRow(3);
+ writeRow(4);
+ writeRow(5);
+ writeRow(6);
+ writeRow(7);
+ }
+#undef writeResultShmem
+#undef writeRow
+
+ const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+ if (threadIdx.x < max_i_write) {
+ if (max_j_write == 8) {
+ // TODO: can i trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+ Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+ Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+ Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+ Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+ Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+ Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+ Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+ } else {
+#pragma unroll 7
+ for (int j = 0; j < max_j_write; j++) {
+ Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+ }
+ }
+ }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(512)
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ volatile Scalar lhs_shmem[72 * 64];
+ __shared__ volatile Scalar rhs_shmem[72 * 64];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size && base_n + 63 < n_size) {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ } else {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][16],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+ typedef float Scalar;
+
+ // prefetch registers
+ float4 lhs_pf0, rhs_pf0;
+
+ float4 results[4];
+ for (int i=0; i < 4; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.loadPacket(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.loadPacket(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
+
+
+ Index lhs_vert = base_m+threadIdx.x*4;
+
+ for (Index k = 0; k < k_size; k += 16) {
+ lhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf0 = internal::pset1<float4>(0);
+
+ Index lhs_horiz = threadIdx.y+k;
+ prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+ Index rhs_vert = k+(threadIdx.x%4)*4;
+ Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ } else {
+ if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ float x1, x2 ;
+ // the following can be a bitwise operation..... some day.
+ if((threadIdx.x%8) < 4) {
+ x1 = rhs_pf0.y;
+ x2 = rhs_pf0.w;
+ } else {
+ x1 = rhs_pf0.x;
+ x2 = rhs_pf0.z;
+ }
+ x1 = __shfl_xor(x1, 4);
+ x2 = __shfl_xor(x2, 4);
+ if((threadIdx.x%8) < 4) {
+ rhs_pf0.y = x1;
+ rhs_pf0.w = x2;
+ } else {
+ rhs_pf0.x = x1;
+ rhs_pf0.z = x2;
+ }
+
+ // We have 64 features.
+ // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+ // ...
+ // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+ // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+ // ...
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // ...
+ // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
+ // ...
+
+ lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+ results[0].x += fl1.x * fr1.x;\
+ results[0].y += fl1.y * fr1.x;\
+ results[0].z += fl2.x * fr1.x;\
+ results[0].w += fl2.y * fr1.x;\
+\
+ results[1].x += fl1.x * fr1.y;\
+ results[1].y += fl1.y * fr1.y;\
+ results[1].z += fl2.x * fr1.y;\
+ results[1].w += fl2.y * fr1.y;\
+\
+ results[2].x += fl1.x * fr2.x;\
+ results[2].y += fl1.y * fr2.x;\
+ results[2].z += fl2.x * fr2.x;\
+ results[2].w += fl2.y * fr2.x;\
+\
+ results[3].x += fl1.x * fr2.y;\
+ results[3].y += fl1.y * fr2.y;\
+ results[3].z += fl2.x * fr2.y;\
+ results[3].w += fl2.y * fr2.y;\
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 16; koff ++) {
+ // 32 x threads.
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+ float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+ int start_feature = threadIdx.y * 4;
+ float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+ float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+ add_vals(fl1, fl2, fr1, fr2)
+ }
+ __syncthreads();
+ }
+
+#undef prefetch_lhs
+#undef add_vals
+
+ Index horiz_base = threadIdx.y*4+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ // CHECK LHS
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ /*
+ int ncols_rem = fminf(n_size- horiz_base, 4);
+ for (int i = 0; i < ncols_rem; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }*/
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][32],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+ typedef float Scalar;
+
+ // prefetch registers
+ float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+ float4 rhs_pf0, rhs_pf1;
+
+ float4 results[8];
+ for (int i=0; i < 8; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+ Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+ for (Index k = 0; k < k_size; k += 32) {
+ lhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf1 = internal::pset1<float4>(0);
+ lhs_pf2 = internal::pset1<float4>(0);
+ lhs_pf3 = internal::pset1<float4>(0);
+
+ rhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf1 = internal::pset1<float4>(0);
+
+ if (!CHECK_LHS_BOUNDARY) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else {
+ // just CHECK_LHS_BOUNDARY
+ if (lhs_vert + 3 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ }
+ }
+ }
+ __syncthreads();
+ Index rhs_vert = k+threadIdx.x*4;
+ Index rhs_horiz0 = threadIdx.y*2+base_n;
+ Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else {
+ if (rhs_horiz1 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (k+threadIdx.x*4 + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (k+threadIdx.x*4 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ __syncthreads();
+ // Loaded. Do computation
+ // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+ // ..
+ // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+ rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+ // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+ // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+ // ..
+ rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+ // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+ // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+ rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+ // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+ // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+ rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+ // LHS.
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // ...
+ // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+ // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+ results[0].x += a_feat1.x * f1.x;\
+ results[1].x += a_feat1.x * f1.y;\
+ results[2].x += a_feat1.x * f2.x;\
+ results[3].x += a_feat1.x * f2.y;\
+ results[4].x += a_feat1.x * f3.x;\
+ results[5].x += a_feat1.x * f3.y;\
+ results[6].x += a_feat1.x * f4.x;\
+ results[7].x += a_feat1.x * f4.y;\
+\
+ results[0].y += a_feat1.y * f1.x;\
+ results[1].y += a_feat1.y * f1.y;\
+ results[2].y += a_feat1.y * f2.x;\
+ results[3].y += a_feat1.y * f2.y;\
+ results[4].y += a_feat1.y * f3.x;\
+ results[5].y += a_feat1.y * f3.y;\
+ results[6].y += a_feat1.y * f4.x;\
+ results[7].y += a_feat1.y * f4.y;\
+\
+ results[0].z += a_feat2.x * f1.x;\
+ results[1].z += a_feat2.x * f1.y;\
+ results[2].z += a_feat2.x * f2.x;\
+ results[3].z += a_feat2.x * f2.y;\
+ results[4].z += a_feat2.x * f3.x;\
+ results[5].z += a_feat2.x * f3.y;\
+ results[6].z += a_feat2.x * f4.x;\
+ results[7].z += a_feat2.x * f4.y;\
+\
+ results[0].w += a_feat2.y * f1.x;\
+ results[1].w += a_feat2.y * f1.y;\
+ results[2].w += a_feat2.y * f2.x;\
+ results[3].w += a_feat2.y * f2.y;\
+ results[4].w += a_feat2.y * f3.x;\
+ results[5].w += a_feat2.y * f3.y;\
+ results[6].w += a_feat2.y * f4.x;\
+ results[7].w += a_feat2.y * f4.y;\
+
+ lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+ lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+ lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+ lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+ lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+ lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+ lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 32; koff ++) {
+ float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+ float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+ // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+ int start_feature = (threadIdx.y / 4) * 8;
+
+ float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
+ float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+ float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+ float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+ add_vals(a3, a4, br1, br2, br3, br4)
+ }
+ __syncthreads();
+ } // end loop over k
+
+
+ __syncthreads();
+ Index horiz_base = (threadIdx.y/4)*8+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK BOUNDARY_B
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[64*32];
+ __shared__ float2 rhs_shmem[128*8];
+
+ typedef float2 LHS_MEM[64][32];
+ typedef float2 RHS_MEM[128][8];
+
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 128 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ bool check_rhs = (base_n + 63) >= n_size;
+ bool check_lhs128 = (base_m + 127) >= m_size;
+ bool check_lhs64 = (base_m + 63) >= m_size;
+
+ if (!check_rhs) {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[32][16];
+ __shared__ float2 rhs_shmem[64][8];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size) {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+
+ typedef GpuDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Packet Packet;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+ typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+
+ static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ // We need to redefine this method to make nvcc happy
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(this->m_result);
+ return true;
+ }
+ }
+
+ void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, 4,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, 4,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
+ if (internal::is_same<LhsScalar, float>::value &&
+ internal::is_same<RhsScalar, float>::value) {
+ if (m < 768 || n < 768) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(16, 16, 1);
+ LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
+ } else {
+ const Index m_blocks = (m + 127) / 128;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 32, 1);
+ LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
+ }
+ } else {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 8, 8);
+ LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
+ }
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and __CUDACC__
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
new file mode 100644
index 000000000..8b87f1045
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+
+// evaluator for thread pool device
+#ifdef EIGEN_USE_THREADS
+
+namespace Eigen {
+namespace internal {
+
+template<typename LhsScalar, typename LhsMapper, typename Index>
+struct packLhsArg {
+ LhsScalar* blockA;
+ const LhsMapper& lhs;
+ const Index m_start;
+ const Index k_start;
+ const Index mc;
+ const Index kc;
+};
+
+template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
+struct packRhsAndKernelArg {
+ const std::vector<LhsScalar*>* blockAs;
+ RhsScalar* blockB;
+ const RhsMapper& rhs;
+ OutputMapper& output;
+ const Index m;
+ const Index k;
+ const Index n;
+ const Index mc;
+ const Index kc;
+ const Index nc;
+ const Index num_threads;
+ const Index num_blockAs;
+ const Index max_m;
+ const Index k_block_idx;
+ const Index m_block_idx;
+ const Index n_block_idx;
+ const Index m_blocks;
+ const Index n_blocks;
+ std::vector<Promise>* kernel_promises;
+ const std::vector<Future>* lhs_futures;
+ const bool need_to_pack;
+};
+
+} // end namespace internal
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
+
+ typedef ThreadPoolDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Packet Packet;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+ typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+
+ static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ if (this->m_j_size == 1) {
+ this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+ return;
+ }
+
+ evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalGemm(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+
+ const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
+ const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ // TODO: packing could be faster sometimes if we supported row major tensor mappers
+ typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
+ Traits::LhsProgress, ColMajor> LhsPacker;
+ typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
+
+ // TODO: replace false, false with conjugate values?
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+ Traits::mr, Traits::nr, false, false> GebpKernel;
+
+ typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
+ typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ LhsPacker pack_lhs;
+
+ // compute block sizes (which depend on number of threads)
+ const Index num_threads = this->m_device.numThreads();
+ Index mc = m;
+ Index nc = n;
+ Index kc = k;
+ internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
+ eigen_assert(mc <= m);
+ eigen_assert(nc <= n);
+ eigen_assert(kc <= k);
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+ const Index k_blocks = CEIL_DIV(k, kc);
+ const Index n_blocks = CEIL_DIV(n, nc);
+ const Index m_blocks = CEIL_DIV(m, mc);
+ const int sizeA = mc * kc;
+ const int sizeB = kc * nc;
+
+ /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
+ cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
+ cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
+ cout << "num threads: " << num_threads << endl;
+ */
+
+ // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
+ // aren't 16 byte aligned segfaults will happen due to SIMD instructions
+ // note: You can get away with allocating just a single blockA and offsets and meet the
+ // the alignment requirements with the assumption that
+ // (Traits::mr * sizeof(ResScalar)) % 16 == 0
+ const Index numBlockAs = (std::min)(num_threads, m_blocks);
+ std::vector<LhsScalar *> blockAs;
+ blockAs.reserve(num_threads);
+ for (int i = 0; i < num_threads; i++) {
+ blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
+ }
+
+ // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
+ // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
+ // Other options: (1) reuse memory when a thread finishes. con: tricky
+ // (2) allocate block B memory in each thread. con: overhead
+ std::vector<RhsScalar *> blockBs;
+ blockBs.reserve(n_blocks);
+ for (int i = 0; i < n_blocks; i++) {
+ blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
+ }
+
+ // lhs_futures starts with all null futures
+ std::vector<Future> lhs_futures(num_threads);
+
+ // this should really be numBlockAs * n_blocks;
+ const Index num_kernel_promises = num_threads * n_blocks;
+ std::vector<Promise> kernel_promises(num_kernel_promises);
+ std::vector<Future> kernel_futures(num_kernel_promises);
+ for (int i = 0; i < kernel_promises.size(); ++i) {
+ kernel_promises[i].set_value();
+ kernel_futures[i] = kernel_promises[i].get_future();
+ }
+
+ for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
+ const Index k_start = k_block_idx * kc;
+ // make sure we don't overshoot right edge of left matrix
+ const Index actual_kc = (std::min)(k_start + kc, k) - k_start;
+
+ for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
+ const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
+
+ for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
+ const Index m_start = mt_block_idx * mc;
+ const Index actual_mc = (std::min)(m_start + mc, m) - m_start;
+ eigen_assert(actual_mc > 0);
+
+ int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+ for (int i = 0; i < n_blocks; ++i) {
+ int future_id = (blockAId * n_blocks + i);
+ wait_until_ready(&kernel_futures[future_id]);
+ kernel_promises[future_id] = Promise();
+ kernel_futures[future_id] = kernel_promises[future_id].get_future();
+ }
+ const packLArg arg = {
+ blockAs[blockAId], // blockA
+ lhs, // lhs
+ m_start, // m
+ k_start, // k
+ actual_mc, // mc
+ actual_kc, // kc
+ };
+
+ lhs_futures[blockAId] =
+ this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
+ }
+
+ // now start kernels.
+ const Index m_base_start = m_block_idx * mc;
+ const bool need_to_pack = m_block_idx == 0;
+
+ for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
+ const Index n_start = n_block_idx * nc;
+ const Index actual_nc = (std::min)(n_start + nc, n) - n_start;
+
+ // first make sure the previous kernels are all done before overwriting rhs. Also wait if
+ // we're going to start new k. In both cases need_to_pack is true.
+ if (need_to_pack) {
+ for (int i = num_blocks; i < num_threads; ++i) {
+ int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
+ int future_id = (blockAId * n_blocks + n_block_idx);
+ wait_until_ready(&kernel_futures[future_id]);
+ }
+ }
+
+ packRKArg arg = {
+ &blockAs, // blockA
+ blockBs[n_block_idx], // blockB
+ rhs, // rhs
+ output, // output
+ m_base_start, // m
+ k_start, // k
+ n_start, // n
+ mc, // mc
+ actual_kc, // kc
+ actual_nc, // nc
+ num_threads,
+ numBlockAs,
+ m,
+ k_block_idx,
+ m_block_idx,
+ n_block_idx, // n_block_idx
+ m_blocks, // m_blocks
+ n_blocks, // n_blocks
+ &kernel_promises, // kernel_promises
+ &lhs_futures, // lhs_futures
+ need_to_pack, // need_to_pack
+ };
+
+ this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+ }
+ }
+ }
+
+ // Make sure all the kernels are done.
+ for (int i = 0; i < kernel_futures.size(); ++i) {
+ wait_until_ready(&kernel_futures[i]);
+ }
+
+ // deallocate all of the memory for both A and B's
+ for (int i = 0; i < blockAs.size(); i++) {
+ this->m_device.deallocate(blockAs[i]);
+ }
+ for (int i = 0; i < blockBs.size(); i++) {
+ this->m_device.deallocate(blockBs[i]);
+ }
+
+#undef CEIL_DIV
+ }
+
+ /*
+ * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
+ * the LHS block, check that all of the kernels that worked on the same
+ * mt_block_idx in the previous m_block are done.
+ */
+ template <typename packLArg, typename LhsPacker>
+ static void packLhs(const packLArg arg) {
+ // perform actual packing
+ LhsPacker pack_lhs;
+ pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
+ }
+
+ /*
+ * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
+ * all kernels in the previous block are done.
+ * Then for each LHS future, we wait on the future and then call GEBP
+ * on the area packed by the future (which starts at
+ * blockA + future_idx * mt * kc) on the LHS and with the full packed
+ * RHS block.
+ * The output of this GEBP is written to output(m + i * mt, n).
+ */
+ template <typename packRKArg, typename RhsPacker, typename GebpKernel>
+ static void packRhsAndKernel(packRKArg arg) {
+ if (arg.need_to_pack) {
+ RhsPacker pack_rhs;
+ pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
+ }
+
+ GebpKernel gebp;
+ for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
+ const Index m_base_start = arg.m + arg.mc*mt_block_idx;
+ if (m_base_start < arg.max_m) {
+ int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
+
+ wait_until_ready(&(*arg.lhs_futures)[blockAId]);
+ const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
+ gebp(arg.output.getSubMapper(m_base_start, arg.n),
+ (*arg.blockAs)[blockAId], arg.blockB,
+ actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
+
+ const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
+ (*arg.kernel_promises)[set_idx].set_value();
+ }
+ }
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_THREADS
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
new file mode 100644
index 000000000..591fd2464
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -0,0 +1,912 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+namespace internal {
+
+
+template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
+ public:
+ IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+ const array<Index, NumKernelDims>& indices) {
+
+ array<Index, NumDims> dimensions = input_dims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = indices[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ dimensions[index] = result_dim;
+ }
+
+ array<Index, NumDims> inputStrides;
+ array<Index, NumDims> outputStrides;
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > 0) {
+ inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+ outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+ } else {
+ inputStrides[0] = 1;
+ outputStrides[0] = 1;
+ }
+ }
+
+ array<Index, NumDims> cudaInputDimensions;
+ array<Index, NumDims> cudaOutputDimensions;
+ array<Index, NumDims> tmp = dimensions;
+ array<Index, NumDims> ordering;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ ordering[i] = indices[i];
+ tmp[indices[i]] = -1;
+ cudaInputDimensions[i] = input_dims[ordering[i]];
+ cudaOutputDimensions[i] = dimensions[ordering[i]];
+ }
+ int written = NumKernelDims;
+ for (int i = 0; i < NumDims; ++i) {
+ if (tmp[i] >= 0) {
+ ordering[written] = i;
+ cudaInputDimensions[written] = input_dims[i];
+ cudaOutputDimensions[written] = dimensions[i];
+ ++written;
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = inputStrides[ordering[i]];
+ m_outputStrides[i] = outputStrides[ordering[i]];
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > NumKernelDims) {
+ m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
+ m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
+ } else {
+ m_cudaInputStrides[i] = 1;
+ m_cudaOutputStrides[i] = 1;
+ }
+ }
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+ Index inputIndex = 0;
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_cudaInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_cudaInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[NumKernelDims];
+ return inputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+ Index outputIndex = 0;
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_cudaOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_cudaOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[NumKernelDims];
+ return outputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+ return i * m_inputStrides[0];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+ return i * m_outputStrides[0];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+ return i * m_inputStrides[0] + j*m_inputStrides[1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ return i * m_outputStrides[0] + j * m_outputStrides[1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
+ }
+
+ private:
+ static const size_t NumDims = internal::array_size<InputDims>::value;
+ array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_cudaInputStrides;
+ array<Index, NumDims> m_cudaOutputStrides;
+};
+
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename InputXprType::Scalar,
+ typename KernelXprType::Scalar>::ret Scalar;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+ typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<InputXprType>::Index,
+ typename traits<KernelXprType>::Index>::type Index;
+ typedef typename InputXprType::Nested LhsNested;
+ typedef typename KernelXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<InputXprType>::NumDimensions;
+ static const int Layout = traits<InputXprType>::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+ typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+ typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+ typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
+ typename KernelXprType::PacketReturnType>::ret PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+ : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename InputXprType::Nested>::type&
+ inputExpression() const { return m_input_xpr; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename KernelXprType::Nested>::type&
+ kernelExpression() const { return m_kernel_xpr; }
+
+ protected:
+ typename InputXprType::Nested m_input_xpr;
+ typename KernelXprType::Nested m_kernel_xpr;
+ const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<InputArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ // Only column major tensors are supported for now.
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_inputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
+ }
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i > 0) {
+ m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
+ } else {
+ m_kernelStride[0] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
+ }
+ }
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ preloadKernel();
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ void evalTo(typename XprType::Scalar* buffer) {
+ evalSubExprsIfNeeded(NULL);
+ for (int i = 0; i < dimensions().TotalSize(); ++i) {
+ buffer[i] += coeff(i);
+ }
+ cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ CoeffReturnType result = CoeffReturnType(0);
+ convolve(firstInput(index), 0, NumKernelDims-1, result);
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+ {
+ const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ Index indices[2] = {index, index+PacketSize-1};
+ Index startInputs[2] = {0, 0};
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ startInputs[0] += indices[0];
+ startInputs[1] += indices[1];
+
+ if (startInputs[1]-startInputs[0] == PacketSize-1) {
+ PacketReturnType result = internal::pset1<PacketReturnType>(0);
+ convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+ return result;
+ } else {
+ EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+ data[0] = Scalar(0);
+ convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+ for (int i = 1; i < PacketSize-1; ++i) {
+ data[i] = Scalar(0);
+ convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+ }
+ data[PacketSize-1] = Scalar(0);
+ convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+ return internal::pload<PacketReturnType>(data);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ startInput += index;
+ return startInput;
+ }
+
+ EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolve(input, kernel, DimIndex-1, accum);
+ } else {
+ accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+ }
+ }
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolvePacket(input, kernel, DimIndex-1, accum);
+ } else {
+ accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+ }
+ }
+ }
+
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ array<Index, NumDims> m_inputStride;
+ array<Index, NumDims> m_outputStride;
+
+ array<Index, NumKernelDims> m_indexStride;
+ array<Index, NumKernelDims> m_kernelStride;
+ TensorEvaluator<InputArgType, Device> m_inputImpl;
+ TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+ Dimensions m_dimensions;
+
+ KernelArgType m_kernelArg;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Device& m_device;
+};
+
+
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+ return StaticKernelSize;
+ }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+ return kernelSize;
+ }
+};
+
+
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
+ extern __shared__ float s[];
+
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_plane = blockIdx.y * blockDim.y;
+ const int plane_stride = blockDim.y * gridDim.y;
+
+ for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+ // Load inputs to shared memory
+ const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = threadIdx.y * num_x_input;
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+ s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+ }
+
+ __syncthreads();
+
+ // Compute the convolution
+ const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ const int kernel_offset = plane_kernel_offset + i;
+ float result = 0.0f;
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+ result += s[k + kernel_offset] * kernel[k];
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+ buffer[tensor_index] = result;
+ }
+ __syncthreads();
+ }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
+ extern __shared__ float s[];
+
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_y = blockIdx.y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+ const int num_y_output = last_y - first_y + 1;
+
+ const int first_plane = blockIdx.z * blockDim.z;
+ const int plane_stride = blockDim.z * gridDim.z;
+
+ for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+
+ const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+ // Load inputs to shared memory
+ #pragma unroll
+ for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+ const int input_offset = num_x_input * (j + plane_kernel_offset);
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ s[i + input_offset] = eval.coeff(tensor_index);
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ float result = 0.0f;
+ #pragma unroll
+ for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+ const int kernel_offset = kernelSizeX * l;
+ const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+ result += s[k + input_offset] * kernel[k + kernel_offset];
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ buffer[tensor_index] = result;
+ }
+ }
+
+ __syncthreads();
+ }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
+ extern __shared__ float s[];
+
+ // Load inputs to shared memory
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + kernelSizeX;
+
+ const int first_y = blockIdx.y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + kernelSizeY;
+
+ const int first_z = blockIdx.z * maxZ;
+ const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+ const int num_z_input = last_z - first_z + kernelSizeZ;
+
+ for (int p = 0; p < numPlanes; ++p) {
+
+ const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = 0;
+
+ for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+ for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int num_z_output = last_z - first_z + 1;
+ const int num_y_output = last_y - first_y + 1;
+ const int num_x_output = last_x - first_x + 1;
+ const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+ for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+ for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ float result = 0.0f;
+ for (int n = 0; n < kernelSizeZ; ++n) {
+ for (int m = 0; m < kernelSizeY; ++m) {
+ for (int l = 0; l < kernelSizeX; ++l) {
+ result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+ }
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ buffer[tensor_index] = result;
+ }
+ }
+ }
+ __syncthreads();
+ }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+ PacketAccess = false,
+ Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ // Only column major tensors are supported for now.
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ static unsigned int ceil(unsigned int num, unsigned int denom) {
+ const unsigned int rounded_toward_zero = num / denom;
+ if (num > rounded_toward_zero * denom) {
+ return rounded_toward_zero + 1;
+ }
+ return rounded_toward_zero;
+ }
+
+ void executeEval(Scalar* data) const {
+ typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+ const int maxSharedMem = sharedMemPerBlock();
+ const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
+ const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = getNumCudaMultiProcessors();
+ const int warpSize = 32;
+
+ switch (NumKernelDims) {
+ case 1: {
+ const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numP = dimensions().TotalSize() / numX;
+
+ int maxX;
+ dim3 block_size;
+ if (m_indices[0] == 0) {
+ // Maximum the reuse
+ const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+ maxX = (std::min<int>)(inner_dim, numX);
+ const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+ block_size.x = (std::min)(maxThreadsPerBlock, maxX);
+ block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+ }
+ else {
+ // Read as much as possible alongside the inner most dimension, that is the plane
+ const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+ const int maxP = (std::min<int>)(inner_dim, numP);
+ maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+ block_size.x = (std::min)(warpSize, maxX);
+ block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+ }
+
+ const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+ dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+ //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 1> indices(m_indices[0]);
+ const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+ internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ switch(kernel_size) {
+ case 4: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ break;
+ }
+ case 7: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ break;
+ }
+ default: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ }
+ }
+ break;
+ }
+
+ case 2: {
+ const int kernel_size_x = m_kernelImpl.dimensions()[0];
+ const int kernel_size_y = m_kernelImpl.dimensions()[1];
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numY = dimensions()[m_indices[1]];
+ const int numP = dimensions().TotalSize() / (numX*numY);
+
+ const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+ // Snap maxX to warp size
+ int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+ const int maxX = (std::min<int>)(inner_dim, numX);
+ const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+ const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+ dim3 block_size;
+ block_size.x = (std::min)(1024, maxX);
+ block_size.y = (std::min<int>)(1024/block_size.x, maxY);
+ block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+
+ const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int num_y_blocks = ceil(numY, maxY);
+ const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+ dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+ //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 2> indices(m_indices[0], m_indices[1]);
+ const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
+ internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ switch (kernel_size_x) {
+ case 4: {
+ switch (kernel_size_y) {
+ case 7: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ break;
+ }
+ default: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ case 7: {
+ switch (kernel_size_y) {
+ case 4: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ break;
+ }
+ default: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ default: {
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+
+ case 3: {
+ const int kernel_size_x = m_kernelImpl.dimensions()[0];
+ const int kernel_size_y = m_kernelImpl.dimensions()[1];
+ const int kernel_size_z = m_kernelImpl.dimensions()[2];
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numY = dimensions()[m_indices[1]];
+ const int numZ = dimensions()[m_indices[2]];
+ const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+ const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+ const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+ const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+ dim3 block_size;
+ block_size.x = (std::min)(32, maxX);
+ block_size.y = (std::min)(32, maxY);
+ block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
+ dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+ const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+ const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
+ const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
+ internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+ LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ break;
+ }
+
+ default: {
+ assert(false && "not supported yet");
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator& operator = (const TensorEvaluator&);
+
+ TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+ TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+ KernelArgType m_kernelArg;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ Scalar* m_buf;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+
+ const GpuDevice& m_device;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
new file mode 100644
index 000000000..649bdb308
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+
+namespace Eigen {
+
+/** \class TensorDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its argument
+ * on the specified computing 'device' (GPU, thread pool, ...)
+ *
+ * Example:
+ * C.device(EIGEN_GPU) = A + B;
+ *
+ * Todo: thread pools.
+ * Todo: operator +=, -=, *= and so on.
+ */
+
+template <typename ExpressionType, typename DeviceType> class TensorDevice {
+ public:
+ TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ Assign assign(m_expression, other);
+ static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+ internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+ typedef typename OtherDerived::Scalar Scalar;
+ typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+ Sum sum(m_expression, other);
+ typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+ Assign assign(m_expression, sum);
+ static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+ internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+ return *this;
+ }
+
+ protected:
+ const DeviceType& m_device;
+ ExpressionType& m_expression;
+};
+
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
+ public:
+ TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ Assign assign(m_expression, other);
+ static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+ internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+ typedef typename OtherDerived::Scalar Scalar;
+ typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+ Sum sum(m_expression, other);
+ typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+ Assign assign(m_expression, sum);
+ static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+ internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+ return *this;
+ }
+
+ protected:
+ const ThreadPoolDevice& m_device;
+ ExpressionType& m_expression;
+};
+#endif
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
+{
+ public:
+ TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ Assign assign(m_expression, other);
+ internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+ typedef typename OtherDerived::Scalar Scalar;
+ typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+ Sum sum(m_expression, other);
+ typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+ Assign assign(m_expression, sum);
+ internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
+ return *this;
+ }
+
+ protected:
+ const GpuDevice& m_device;
+ ExpressionType m_expression;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
new file mode 100644
index 000000000..efd207507
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ return 1;
+ }
+};
+
+
+// Multiple cpu cores
+// We should really use a thread pool here but first we need to find a portable thread pool library.
+#ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+typedef std::promise<void> Promise;
+
+static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) {
+ f->wait();
+}
+static EIGEN_STRONG_INLINE void get_when_ready(Future* f) {
+ f->get();
+}
+
+
+struct ThreadPoolDevice {
+ ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ return num_threads_;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
+ return std::async(std::launch::async, f, args...);
+ }
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
+ std::async(std::launch::async, f, args...);
+ }
+
+ private:
+ size_t num_threads_;
+};
+
+#endif
+
+
+// GPU offloading
+#ifdef EIGEN_USE_GPU
+static cudaDeviceProp m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+ assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess);
+ m_devicePropInitialized = true;
+ }
+}
+
+static inline int getNumCudaMultiProcessors() {
+ initializeDeviceProp();
+ return m_deviceProperties.multiProcessorCount;
+}
+static inline int maxCudaThreadsPerBlock() {
+ initializeDeviceProp();
+ return m_deviceProperties.maxThreadsPerBlock;
+}
+static inline int maxCudaThreadsPerMultiProcessor() {
+ initializeDeviceProp();
+ return m_deviceProperties.maxThreadsPerMultiProcessor;
+}
+static inline int sharedMemPerBlock() {
+ initializeDeviceProp();
+ return m_deviceProperties.sharedMemPerBlock;
+}
+
+static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+ cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+ assert(status == cudaSuccess);
+}
+
+struct GpuDevice {
+ // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
+ GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
+
+ EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+#ifndef __CUDA_ARCH__
+ void* result;
+ assert(cudaMalloc(&result, num_bytes) == cudaSuccess);
+ assert(result != NULL);
+ return result;
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+ return NULL;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+#ifndef __CUDA_ARCH__
+ assert(buffer != NULL);
+ assert(cudaFree(buffer) == cudaSuccess);
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+ assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess);
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+ assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess);
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+ cudaStreamSynchronize(*stream_);
+ }
+
+ private:
+ // TODO: multigpu.
+ const cudaStream_t* stream_;
+};
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
new file mode 100644
index 000000000..2ad52b2f9
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -0,0 +1,380 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorDimensions
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Set of classes used to encode and store the dimensions of a Tensor.
+ *
+ * The Sizes class encodes as part of the type the number of dimensions and the
+ * sizes corresponding to each dimension. It uses no storage space since it is
+ * entirely known at compile time.
+ * The DSizes class is its dynamic sibling: the number of dimensions is known
+ * at compile time but the sizes are set during execution.
+ *
+ * \sa Tensor
+ */
+
+// Can't use std::pair on cuda devices
+template <typename Index> struct IndexPair {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
+ Index first;
+ Index second;
+};
+
+// Boilerplate code
+namespace internal {
+
+template<std::size_t n, typename Dimension> struct dget {
+ static const std::size_t value = get<n, Dimension>::value;
+};
+
+
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static inline Index run(array<Index, NumIndices> const& indices,
+ const Dimensions& dimensions)
+ {
+ return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+ dget<RowMajor ? n : (NumIndices - n - 1), Dimensions>::value *
+ fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static inline Index run(array<Index, NumIndices> const& indices,
+ const Dimensions&)
+ {
+ return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+ }
+};
+
+} // end namespace internal
+
+
+// Fixed size
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::size_t... Indices>
+struct Sizes : internal::numeric_list<std::size_t, Indices...> {
+ typedef internal::numeric_list<std::size_t, Indices...> Base;
+ static const std::size_t total_size = internal::arg_prod(Indices...);
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+ return Base::count;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() {
+ return internal::arg_prod(Indices...);
+ }
+
+ Sizes() { }
+ template <typename DenseIndex>
+ explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+ // todo: add assertion
+ }
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template <typename... DenseIndex> Sizes(DenseIndex...) { }
+ explicit Sizes(std::initializer_list<std::size_t> /*l*/) {
+ // todo: add assertion
+ }
+#endif
+
+ template <typename T> Sizes& operator = (const T& /*other*/) {
+ // add assertion failure if the size of other is different
+ return *this;
+ }
+
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
+ }
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
+ }
+};
+
+template <typename std::size_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) {
+ return Sizes<Indices...>::total_size;
+}
+
+#else
+
+template <std::size_t n>
+struct non_zero_size {
+ typedef internal::type2val<std::size_t, n> type;
+};
+template <>
+struct non_zero_size<0> {
+ typedef internal::null_type type;
+};
+
+template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
+ typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
+ static const size_t count = Base::count;
+ static const std::size_t total_size = internal::arg_prod<Base>::value;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+ return count;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
+ return internal::arg_prod<Base>::value;
+ }
+
+ Sizes() { }
+ template <typename DenseIndex>
+ explicit Sizes(const array<DenseIndex, Base::count>& indices) {
+ // todo: add assertion
+ }
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template <typename... DenseIndex> Sizes(DenseIndex... indices) { }
+ explicit Sizes(std::initializer_list<std::size_t> l) {
+ // todo: add assertion
+ }
+#else
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
+ }
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
+ }
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ }
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ }
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ }
+#endif
+
+ template <typename T> Sizes& operator = (const T& other) {
+ // to do: check the size of other
+ return *this;
+ }
+
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this);
+ }
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this);
+ }
+};
+
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+ return Sizes<V1, V2, V3, V4, V5>::total_size;
+};
+
+#endif
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_index_linearization_helper
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
+ {
+ return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+ array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+ tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
+ {
+ return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+ }
+};
+} // end namespace internal
+
+
+
+// Dynamic size
+template <typename DenseIndex, std::size_t NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+ typedef array<DenseIndex, NumDims> Base;
+ static const std::size_t count = NumDims;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+ return NumDims;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
+ return internal::array_prod(*static_cast<const Base*>(this));
+ }
+
+ EIGEN_DEVICE_FUNC DSizes() {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = 0;
+ }
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}};
+ }
+#else
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
+ eigen_assert(NumDims == 1);
+ (*this)[0] = i0;
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+ eigen_assert(NumDims == 2);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ eigen_assert(NumDims == 3);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ eigen_assert(NumDims == 4);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ (*this)[3] = i3;
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ eigen_assert(NumDims == 5);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ (*this)[3] = i3;
+ (*this)[4] = i4;
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
+ *static_cast<Base*>(this) = other;
+ return *this;
+ }
+
+ // A constexpr would be so much better here
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+ return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+ return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
+ }
+};
+
+
+
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_vsize_index_linearization_helper
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
+ {
+ return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+ array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+ tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
+ {
+ return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+ }
+};
+} // end namespace internal
+
+
+namespace internal {
+
+template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
+ static const size_t value = NumDims;
+};
+template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
+ static const size_t value = NumDims;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > {
+static const size_t value = Sizes<Indices...>::count;
+};
+template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > {
+static const size_t value = Sizes<Indices...>::count;
+};
+template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<Indices...>&) {
+ return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+}
+#else
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+ static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+ static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) {
+ return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
+};
+
+#endif
+
+
+template <typename Dims1, typename Dims2, size_t n>
+struct sizes_match_up_to_dim {
+ static inline bool run(Dims1& dims1, Dims2& dims2) {
+ return (array_get<n>(dims1) == array_get<n>(dims2)) &
+ sizes_match_up_to_dim<Dims1, Dims2, n-1>::run(dims1, dims2);
+ }
+};
+template <typename Dims1, typename Dims2>
+struct sizes_match_up_to_dim<Dims1, Dims2, 0> {
+ static inline bool run(Dims1& dims1, Dims2& dims2) {
+ return (array_get<0>(dims1) == array_get<0>(dims2));
+ }
+};
+
+} // end namespace internal
+
+
+template <typename Dims1, typename Dims2>
+bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+ if (internal::array_size<Dims1>::value != internal::array_size<Dims2>::value) {
+ return false;
+ }
+ return internal::sizes_match_up_to_dim<Dims1, Dims2, internal::array_size<Dims1>::value-1>::run(dims1, dims2);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
new file mode 100644
index 000000000..883e6cab1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename XprType>
+struct traits<TensorEvalToOp<XprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename XprType>
+struct eval<TensorEvalToOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorEvalToOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type>
+{
+ typedef TensorEvalToOp<XprType> type;
+};
+
+} // end namespace internal
+
+
+
+
+template<typename XprType>
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr)
+ : m_xpr(expr), m_buffer(buffer) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ CoeffReturnType* m_buffer;
+};
+
+
+
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
+{
+ typedef TensorEvalToOp<ArgType> XprType;
+ typedef typename ArgType::Scalar Scalar;
+ typedef typename ArgType::Packet Packet;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer())
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ m_buffer[i] = m_impl.coeff(i);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+ internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_buffer[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<Packet, LoadMode>(m_buffer + index);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ CoeffReturnType* m_buffer;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
new file mode 100644
index 000000000..d084880de
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -0,0 +1,427 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor evaluator classes.
+ *
+ * These classes are responsible for the evaluation of the tensor expression.
+ *
+ * TODO: add support for more types of expressions, in particular expressions
+ * leading to lvalues (slicing, reshaping, etc...)
+ */
+
+// Generic evaluator
+template<typename Derived, typename Device>
+struct TensorEvaluator
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Packet Packet;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename Derived::Packet PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+
+ // NumDimensions is -1 for variable dim tensors
+ static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+ internal::traits<Derived>::NumDimensions : 0;
+
+ enum {
+ IsAligned = Derived::IsAligned,
+ PacketAccess = Derived::PacketAccess,
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+ : m_data(const_cast<Scalar*>(m.data())), m_dims(m.dimensions()), m_device(device)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
+ if (dest) {
+ m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+ return false;
+ }
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_data);
+ return m_data[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ eigen_assert(m_data);
+ return m_data[index];
+ }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<Packet, LoadMode>(m_data + index);
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const Packet& x)
+ {
+ return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+ eigen_assert(m_data);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return m_data[m_dims.IndexOfColMajor(coords)];
+ } else {
+ return m_data[m_dims.IndexOfRowMajor(coords)];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ eigen_assert(m_data);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return m_data[m_dims.IndexOfColMajor(coords)];
+ } else {
+ return m_data[m_dims.IndexOfRowMajor(coords)];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
+
+ protected:
+ Scalar* m_data;
+ Dimensions m_dims;
+ const Device& m_device;
+};
+
+
+// Default evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const Derived, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Packet Packet;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename Derived::Packet PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+
+ // NumDimensions is -1 for variable dim tensors
+ static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+ internal::traits<Derived>::NumDimensions : 0;
+
+ enum {
+ IsAligned = Derived::IsAligned,
+ PacketAccess = Derived::PacketAccess,
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&)
+ : m_data(m.data()), m_dims(m.dimensions())
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_data);
+#ifdef __CUDA_ARCH__
+ return __ldg(m_data+index);
+#else
+ return m_data[index];
+#endif
+ }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt_ro<Packet, LoadMode>(m_data + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+ eigen_assert(m_data);
+ const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
+ : m_dims.IndexOfRowMajor(coords);
+#ifdef __CUDA_ARCH__
+ return __ldg(m_data+index);
+#else
+ return m_data[index];
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
+
+ protected:
+ const Scalar* m_data;
+ Dimensions m_dims;
+};
+
+
+
+
+// -------------------- CwiseNullaryOp --------------------
+
+template<typename NullaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
+{
+ typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename internal::traits<XprType>::Packet PacketReturnType;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(index);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const NullaryOp m_functor;
+ TensorEvaluator<ArgType, Device> m_argImpl;
+};
+
+
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
+{
+ typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_argImpl(op.nestedExpression(), device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename internal::traits<XprType>::Packet PacketReturnType;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_argImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_argImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_argImpl.coeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const UnaryOp m_functor;
+ TensorEvaluator<ArgType, Device> m_argImpl;
+};
+
+
+// -------------------- CwiseBinaryOp --------------------
+
+template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
+{
+ typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+ internal::functor_traits<BinaryOp>::PacketAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_leftImpl(op.lhsExpression(), device),
+ m_rightImpl(op.rhsExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename internal::traits<XprType>::Packet PacketReturnType;
+ typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use right impl instead if right impl dimensions are known at compile time.
+ return m_leftImpl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const BinaryOp m_functor;
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+
+// -------------------- SelectOp --------------------
+
+template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
+struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
+{
+ typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* &
+ TensorEvaluator<IfArgType>::PacketAccess*/,
+ Layout = TensorEvaluator<IfArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_condImpl(op.ifExpression(), device),
+ m_thenImpl(op.thenExpression(), device),
+ m_elseImpl(op.elseExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
+ eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename internal::traits<XprType>::Packet PacketReturnType;
+ typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use then or else impl instead if they happen to be known at compile time.
+ return m_condImpl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_condImpl.evalSubExprsIfNeeded(NULL);
+ m_thenImpl.evalSubExprsIfNeeded(NULL);
+ m_elseImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_condImpl.cleanup();
+ m_thenImpl.cleanup();
+ m_elseImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+ {
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ internal::Selector<PacketSize> select;
+ for (Index i = 0; i < PacketSize; ++i) {
+ select.select[i] = m_condImpl.coeff(index+i);
+ }
+ return internal::pblend(select,
+ m_thenImpl.template packet<LoadMode>(index),
+ m_elseImpl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ TensorEvaluator<IfArgType, Device> m_condImpl;
+ TensorEvaluator<ThenArgType, Device> m_thenImpl;
+ TensorEvaluator<ElseArgType, Device> m_elseImpl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..05ac9bd2f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+namespace Eigen {
+
+/** \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ */
+namespace internal {
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+ static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
+// Default strategy: the expression is evaluated with a single cpu thread.
+template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value>
+class TensorExecutor
+{
+ public:
+ typedef typename Expression::Index Index;
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr, const Device& device = Device())
+ {
+ TensorEvaluator<Expression, Device> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ for (Index i = 0; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+
+template<typename Expression>
+class TensorExecutor<Expression, DefaultDevice, true>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+ {
+ TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ const Index VectorizedSize = (size / PacketSize) * PacketSize;
+
+ for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ for (Index i = VectorizedSize; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+
+
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+#ifdef EIGEN_USE_THREADS
+template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess>
+struct EvalRange {
+ static void run(Evaluator evaluator, const Index first, const Index last) {
+ eigen_assert(last > first);
+ for (Index i = first; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+ static void run(Evaluator evaluator, const Index first, const Index last) {
+ eigen_assert(last > first);
+
+ Index i = first;
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ if (last - first > PacketSize) {
+ eigen_assert(first % PacketSize == 0);
+ Index lastPacket = last - (last % PacketSize);
+ for (; i < lastPacket; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ }
+
+ for (; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
+
+template<typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+ {
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ Evaluator evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+
+ static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+
+ int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+ const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+ Index i = 0;
+ std::vector<Future> results;
+ results.reserve(numblocks);
+ for (int i = 0; i < numblocks; ++i) {
+ results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
+ }
+
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size);
+ }
+
+ for (int i = 0; i < numblocks; ++i) {
+ get_when_ready(&results[i]);
+ }
+
+ }
+ evaluator.cleanup();
+ }
+};
+#endif
+
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
+
+ const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index step_size = blockDim.x * gridDim.x;
+
+ // Use the scalar path
+ for (Index i = first_index; i < size; i += step_size) {
+ eval.evalScalar(i);
+ }
+}
+
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel_Vectorizable(Evaluator eval, Index size) {
+
+ const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index step_size = blockDim.x * gridDim.x;
+
+ // Use the vector path
+ const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const Index vectorized_step_size = step_size * PacketSize;
+ const Index vectorized_size = (size / PacketSize) * PacketSize;
+ for (Index i = first_index * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (Index i = vectorized_size + first_index; i < size; i += step_size) {
+ eval.evalScalar(i);
+ }
+}
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+ static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, false>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const GpuDevice& device)
+ {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+ const int block_size = maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+ }
+};
+
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, true>
+{
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const GpuDevice& device)
+ {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+ const int block_size = maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+ }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
new file mode 100644
index 000000000..b66b3ec2c
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -0,0 +1,300 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+
+namespace Eigen {
+
+/** \class TensorExpr
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor expression classes.
+ *
+ * The TensorCwiseNullaryOp class applies a nullary operators to an expression.
+ * This is typically used to generate constants.
+ *
+ * The TensorCwiseUnaryOp class represents an expression where a unary operator
+ * (e.g. cwiseSqrt) is applied to an expression.
+ *
+ * The TensorCwiseBinaryOp class represents an expression where a binary
+ * operator (e.g. addition) is applied to a lhs and a rhs expression.
+ *
+ */
+namespace internal {
+template<typename NullaryOp, typename XprType>
+struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
+ : traits<XprType>
+{
+ typedef typename XprType::Packet Packet;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::Nested XprTypeNested;
+ typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+} // end namespace internal
+
+
+
+template<typename NullaryOp, typename XprType>
+class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
+ : m_xpr(xpr), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ nestedExpression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const NullaryOp& functor() const { return m_functor; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const NullaryOp m_functor;
+};
+
+
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
+ : traits<XprType>
+{
+ // TODO(phli): Add InputScalar, InputPacket. Check references to
+ // current Scalar/Packet to see if the intent is Input or Output.
+ typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename XprType::Nested XprTypeNested;
+ typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename UnaryOp, typename XprType>
+struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
+};
+
+template<typename UnaryOp, typename XprType>
+struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
+{
+ typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename UnaryOp, typename XprType>
+class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
+{
+ public:
+ // TODO(phli): Add InputScalar, InputPacket. Check references to
+ // current Scalar/Packet to see if the intent is Input or Output.
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+ : m_xpr(xpr), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const UnaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expression */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ nestedExpression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const UnaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs
+ // are different.
+ // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
+ // current Scalar/Packet to see if the intent is Inputs or Output.
+ typedef typename result_of<
+ BinaryOp(typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar)>::type Scalar;
+ typedef traits<LhsXprType> XprTraits;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename promote_storage_type<
+ typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<
+ typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+ public:
+ // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
+ // current Scalar/Packet to see if the intent is Inputs or Output.
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const BinaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const BinaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+ : traits<ThenXprType>
+{
+ typedef typename traits<ThenXprType>::Scalar Scalar;
+ typedef traits<ThenXprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
+ typename traits<ElseXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<ElseXprType>::Index,
+ typename traits<ThenXprType>::Index>::type Index;
+ typedef typename IfXprType::Nested IfNested;
+ typedef typename ThenXprType::Nested ThenNested;
+ typedef typename ElseXprType::Nested ElseNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
+{
+ typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
+{
+ typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorSelectOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
+ typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::promote_storage_type<typename ThenXprType::PacketReturnType,
+ typename ElseXprType::PacketReturnType>::ret PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
+
+ TensorSelectOp(const IfXprType& a_condition,
+ const ThenXprType& a_then,
+ const ElseXprType& a_else)
+ : m_condition(a_condition), m_then(a_then), m_else(a_else)
+ { }
+
+ const IfXprType& ifExpression() const { return m_condition; }
+
+ const ThenXprType& thenExpression() const { return m_then; }
+
+ const ElseXprType& elseExpression() const { return m_else; }
+
+ protected:
+ typename IfXprType::Nested m_condition;
+ typename ThenXprType::Nested m_then;
+ typename ElseXprType::Nested m_else;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
new file mode 100644
index 000000000..94b3f957b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+
+namespace Eigen {
+
+/** \class TensorFixedSize
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The fixed sized version of the tensor class.
+ *
+ * The fixed sized equivalent of
+ * Eigen::Tensor<float, 3> t(3, 5, 7);
+ * is
+ * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
+ */
+
+template<typename Scalar_, typename Dimensions_, int Options_>
+class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
+{
+ public:
+ typedef TensorFixedSize<Scalar_, Dimensions_, Options_> Self;
+ typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<Self>::StorageKind StorageKind;
+ typedef typename internal::traits<Self>::Index Index;
+ typedef Scalar_ Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+
+ static const int Options = Options_;
+
+ enum {
+ IsAligned = bool(EIGEN_ALIGN),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+ CoordAccess = true,
+ };
+
+ typedef Dimensions_ Dimensions;
+ static const std::size_t NumIndices = Dimensions::count;
+
+ protected:
+ TensorStorage<Scalar, NumIndices, Dimensions::total_size, Options, Dimensions> m_storage;
+
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
+
+ // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+ // work, because that uses base().coeffRef() - and we don't yet
+ // implement a similar class hierarchy
+ inline Self& base() { return *this; }
+ inline const Self& base() const { return *this; }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+ {
+ eigen_assert(checkIndexRange(indices));
+ return coeff(indices);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead.
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeff(index);
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+ {
+ eigen_assert(checkIndexRange(indices));
+ return coeffRef(indices);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+ {
+ eigen_assert(index >= 0 && index < size());
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator[](Index index)
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize()
+ : m_storage()
+ {
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
+ : m_storage(other.m_storage)
+ {
+ }
+
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+ inline TensorFixedSize(Self&& other)
+ : m_storage(other.m_storage)
+ {
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
+ {
+ // FIXME: check that the dimensions of other match the dimensions of *this.
+ // Unfortunately this isn't possible yet when the rhs is an expression.
+ typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
+ {
+ // FIXME: check that the dimensions of other match the dimensions of *this.
+ // Unfortunately this isn't possible yet when the rhs is an expression.
+ typedef TensorAssignOp<Self, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
+ {
+ using internal::array_apply_and_reduce;
+ using internal::array_zip_and_reduce;
+ using internal::greater_equal_zero_op;
+ using internal::logical_and_op;
+ using internal::lesser_op;
+
+ return true;
+ // check whether the indices are all >= 0
+ /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+ // check whether the indices fit in the dimensions
+ array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
+ {
+ if (Options&RowMajor) {
+ return m_storage.dimensions().IndexOfRowMajor(indices);
+ } else {
+ return m_storage.dimensions().IndexOfColMajor(indices);
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
new file mode 100644
index 000000000..41a36cb75
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -0,0 +1,151 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0,
+ };
+};
+
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorForcedEvalOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+{
+ typedef TensorForcedEvalOp<XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
+ : m_xpr(expr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+{
+ typedef TensorForcedEvalOp<ArgType> XprType;
+ typedef typename ArgType::Scalar Scalar;
+ typedef typename ArgType::Packet Packet;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ const Index numValues = m_impl.dimensions().TotalSize();
+ m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
+ // Should initialize the memory in case we're dealing with non POD types.
+ if (!internal::is_arithmetic<CoeffReturnType>::value) {
+ for (Index i = 0; i < numValues; ++i) {
+ new(m_buffer+i) CoeffReturnType();
+ }
+ }
+ typedef TensorEvalToOp<const ArgType> EvalTo;
+ EvalTo evalToTmp(m_buffer, m_op);
+ internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<ArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+ m_impl.cleanup();
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_device.deallocate(m_buffer);
+ m_buffer = NULL;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_buffer[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<Packet, LoadMode>(m_buffer + index);
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
+
+ private:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const ArgType m_op;
+ const Device& m_device;
+ CoeffReturnType* m_buffer;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
new file mode 100644
index 000000000..7bec2b10a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+
+namespace Eigen {
+
+template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor;
+template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize;
+template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
+template<typename PlainObjectType> class TensorRef;
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+
+template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
+template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
+template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
+template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
+template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
+template<typename PatchDim, typename XprType> class TensorPatchOp;
+template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
+template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
+template<DenseIndex DimId, typename XprType> class TensorChippingOp;
+template<typename NewDimensions, typename XprType> class TensorReshapingOp;
+template<typename XprType> class TensorLayoutSwapOp;
+template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
+template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
+template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
+template<typename Shuffle, typename XprType> class TensorShufflingOp;
+template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+
+template<typename XprType> class TensorEvalToOp;
+template<typename XprType> class TensorForcedEvalOp;
+
+template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename Derived, typename Device> struct TensorEvaluator;
+
+namespace internal {
+template<typename Expression, typename Device, bool Vectorizable> class TensorExecutor;
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
new file mode 100644
index 000000000..38586d067
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -0,0 +1,338 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+
+namespace Eigen {
+namespace internal {
+
+// Standard reduction functors
+template <typename T> struct SumReducer
+{
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ (*accum) += t;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = padd<Packet>(*accum, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return static_cast<T>(0);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(0);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ return saccum + predux(vaccum);
+ }
+};
+
+template <typename T> struct MeanReducer
+{
+ static const bool PacketAccess = true;
+ MeanReducer() : scalarCount_(0), packetCount_(0) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
+ (*accum) += t;
+ scalarCount_++;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
+ (*accum) = padd<Packet>(*accum, p);
+ packetCount_++;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return static_cast<T>(0);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(0);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum / scalarCount_;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return pdiv(vaccum, pset1<Packet>(packetCount_));
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits<Packet>::size);
+ }
+
+ protected:
+ int scalarCount_;
+ int packetCount_;
+};
+
+template <typename T> struct MaxReducer
+{
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ if (t > *accum) { *accum = t; }
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = pmax<Packet>(*accum, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return -(std::numeric_limits<T>::max)();
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(-(std::numeric_limits<T>::max)());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ return (std::max)(saccum, predux_max(vaccum));
+ }
+};
+
+template <typename T> struct MinReducer
+{
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ if (t < *accum) { *accum = t; }
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = pmin<Packet>(*accum, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return (std::numeric_limits<T>::max)();
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>((std::numeric_limits<T>::max)());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ return (std::min)(saccum, predux_min(vaccum));
+ }
+};
+
+
+template <typename T> struct ProdReducer
+{
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ (*accum) *= t;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = pmul<Packet>(*accum, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return static_cast<T>(1);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(1);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ return saccum * predux_mul(vaccum);
+ }
+};
+
+#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
+// We're not compiling a cuda kernel
+template <typename T> struct UniformRandomGenerator {
+
+ static const bool PacketAccess = true;
+
+ template<typename Index>
+ T operator()(Index, Index = 0) const {
+ return random<T>();
+ }
+ template<typename Index>
+ typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const {
+ const int packetSize = internal::packet_traits<T>::size;
+ EIGEN_ALIGN_DEFAULT T values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = random<T>();
+ }
+ return internal::pload<typename internal::packet_traits<T>::type>(values);
+ }
+};
+
+#else
+
+// We're compiling a cuda kernel
+template <typename T> struct UniformRandomGenerator;
+
+template <> struct UniformRandomGenerator<float> {
+
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC UniformRandomGenerator() {
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ curand_init(0, tid, 0, &m_state);
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC
+ float operator()(Index, Index = 0) const {
+ return curand_uniform(&m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ float4 packetOp(Index, Index = 0) const {
+ return curand_uniform4(&m_state);
+ }
+
+ private:
+ mutable curandStatePhilox4_32_10_t m_state;
+};
+
+template <> struct UniformRandomGenerator<double> {
+
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC UniformRandomGenerator() {
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ curand_init(0, tid, 0, &m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ double operator()(Index, Index = 0) const {
+ return curand_uniform_double(&m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ double2 packetOp(Index, Index = 0) const {
+ return curand_uniform2_double(&m_state);
+ }
+
+ private:
+ mutable curandStatePhilox4_32_10_t m_state;
+};
+
+#endif
+
+
+#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711
+// We're not compiling a cuda kernel
+template <typename T> struct NormalRandomGenerator {
+
+ static const bool PacketAccess = true;
+
+ NormalRandomGenerator() : m_distribution(0, 1) {}
+ NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { }
+
+ template<typename Index>
+ T operator()(Index, Index = 0) const {
+ return m_distribution(m_generator);
+ }
+ template<typename Index>
+ typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const {
+ const int packetSize = internal::packet_traits<T>::size;
+ EIGEN_ALIGN_DEFAULT T values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = m_distribution(m_generator);
+ }
+ return internal::pload<typename internal::packet_traits<T>::type>(values);
+ }
+
+ mutable std::normal_distribution<T> m_distribution;
+ mutable std::default_random_engine m_generator;
+};
+
+#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// We're compiling a cuda kernel
+template <typename T> struct NormalRandomGenerator;
+
+template <> struct NormalRandomGenerator<float> {
+
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC NormalRandomGenerator() {
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ curand_init(0, tid, 0, &m_state);
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC
+ float operator()(Index, Index = 0) const {
+ return curand_normal(&m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ float4 packetOp(Index, Index = 0) const {
+ return curand_normal4(&m_state);
+ }
+
+ private:
+ mutable curandStatePhilox4_32_10_t m_state;
+};
+
+template <> struct NormalRandomGenerator<double> {
+
+ static const bool PacketAccess = true;
+
+ EIGEN_DEVICE_FUNC NormalRandomGenerator() {
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ curand_init(0, tid, 0, &m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ double operator()(Index, Index = 0) const {
+ return curand_normal_double(&m_state);
+ }
+ template<typename Index> EIGEN_DEVICE_FUNC
+ double2 packetOp(Index, Index = 0) const {
+ return curand_normal2_double(&m_state);
+ }
+
+ private:
+ mutable curandStatePhilox4_32_10_t m_state;
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
new file mode 100644
index 000000000..a9d0f6c39
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
+
+namespace Eigen {
+
+namespace internal {
+template<>
+struct significant_decimals_impl<std::string>
+ : significant_decimals_default_impl<std::string, true>
+{};
+}
+
+
+template <typename T>
+std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+ // Evaluate the expression if needed
+ TensorForcedEvalOp<const T> eval = expr.eval();
+ TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+ tensor.evalSubExprsIfNeeded(NULL);
+
+ typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
+ typedef typename T::Index Index;
+ typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+
+ // Print the tensor as a 1d vector or a 2d matrix.
+ if (internal::array_size<Dimensions>::value == 1) {
+ Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+ os << array;
+ } else {
+ const Index first_dim = tensor.dimensions()[0];
+ static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
+ Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+ os << matrix;
+ }
+
+ // Cleanup.
+ tensor.cleanup();
+ return os;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 000000000..bf0e7edfb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorImagePatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for image processing.
+ * This assumes that the input has a least 3 dimensions ordered as follow:
+ * 1st dimension: channels (of size d)
+ * 2nd dimension: rows (of size r)
+ * 3rd dimension: columns (of size c)
+ * There can be additional dimensions such as time (for video) or batch (for
+ * bulk processing after the first 3.
+ * Calling the image patch code with patch_rows and patch_cols is equivalent
+ * to calling the regular patch extraction code with parameters d, patch_rows,
+ * patch_cols, and 1 for all the additional dimensions.
+ */
+namespace internal {
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions + 1;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
+{
+ typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
+{
+ typedef TensorImagePatchOp<Rows, Cols, XprType> type;
+};
+
+} // end namespace internal
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex row_strides, DenseIndex col_strides,
+ PaddingType padding_type)
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_padding_type(padding_type) {}
+
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_rows() const { return m_patch_rows; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_cols() const { return m_patch_cols; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex row_strides() const { return m_row_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex col_strides() const { return m_col_strides; }
+ EIGEN_DEVICE_FUNC
+ PaddingType padding_type() const { return m_padding_type; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const DenseIndex m_patch_rows;
+ const DenseIndex m_patch_cols;
+ const DenseIndex m_row_strides;
+ const DenseIndex m_col_strides;
+ const PaddingType m_padding_type;
+};
+
+
+// Eval as rvalue
+template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
+{
+ typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = NumDims == 5,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ // Only column major tensors are supported for now.
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ // Caches a few variables.
+ m_inputRows = input_dims[1];
+ m_inputCols = input_dims[2];
+
+ m_row_strides = op.row_strides();
+ m_col_strides = op.col_strides();
+
+ // We only support same strides for both dimensions and square patches.
+ eigen_assert(m_row_strides == m_col_strides);
+
+ switch (op.padding_type()) {
+ case PADDING_VALID:
+ m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast<float>(m_row_strides));
+ m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast<float>(m_col_strides));
+ // Calculate the padding
+ m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2;
+ m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2;
+ break;
+ case PADDING_SAME:
+ m_outputRows = ceil(m_inputRows / static_cast<float>(m_row_strides));
+ m_outputCols = ceil(m_inputCols / static_cast<float>(m_col_strides));
+ // Calculate the padding
+ m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2;
+ m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2;
+ break;
+ default:
+ eigen_assert(false && "unexpected padding");
+ }
+
+ // Dimensions for result of extraction.
+ // 0: depth
+ // 1: patch_rows
+ // 2: patch_cols
+ // 3: number of patches
+ // 4 and beyond: anything else (such as batch).
+ m_dimensions[0] = input_dims[0];
+ m_dimensions[1] = op.patch_rows();
+ m_dimensions[2] = op.patch_cols();
+ m_dimensions[3] = m_outputRows * m_outputCols;
+ for (int i = 4; i < NumDims; ++i) {
+ m_dimensions[i] = input_dims[i-1];
+ }
+
+ // Strides for moving the patch in various dimensions.
+ m_colStride = m_dimensions[1];
+ m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
+ m_otherStride = m_patchStride * m_dimensions[3];
+
+ // Strides for navigating through the input tensor.
+ m_rowInputStride = input_dims[0];
+ m_colInputStride = input_dims[0] * input_dims[1];
+ m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2];
+
+ // Fast representations of different variables.
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+ m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+ m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+ // Number of patches in the width dimension.
+ m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+ m_fastDimZero = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Patch index corresponding to the passed in index.
+ const Index patchIndex = index / m_fastPatchStride;
+
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero;
+
+ // Other ways to index this element.
+ const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
+ const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+ const Index colIndex = patch2DIndex / m_fastOutputRows;
+ const Index colOffset = patchOffset / m_fastColStride;
+
+ // Calculate col index in the input original tensor.
+ const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft;
+ if (inputCol < 0 || inputCol >= m_inputCols) {
+ return Scalar(0);
+ }
+ const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+ const Index rowOffset = patchOffset - colOffset * m_colStride;
+
+ // Calculate row index in the original input tensor.
+ const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop;
+ if (inputRow < 0 || inputRow >= m_inputRows) {
+ return Scalar(0);
+ }
+
+ const Index depth = index - (index / m_fastDimZero) * m_dimensions[0];
+
+ const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride;
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ const Index indices[2] = {index, index + packetSize - 1};
+ const Index patchIndex = indices[0] / m_fastPatchStride;
+ if (patchIndex != indices[1] / m_fastPatchStride) {
+ return packetWithPossibleZero(index);
+ }
+ const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
+ eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero,
+ (indices[1] - patchIndex * m_patchStride) / m_fastDimZero};
+
+ const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+ eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+ const Index colIndex = patch2DIndex / m_fastOutputRows;
+ const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
+
+ // Calculate col indices in the original input tensor.
+ const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
+ m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+ if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+ // all zeros
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+
+ if (inputCols[0] == inputCols[1]) {
+ const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+ const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
+ eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+ // Calculate col indices in the original input tensor.
+ const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
+ m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+ if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+ // all zeros
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+
+ if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+ // no padding
+ const Index depth = index - (index / m_fastDimZero) * m_dimensions[0];
+ const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ }
+
+ return packetWithPossibleZero(index);
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ Index rowPaddingTop() const { return m_rowPaddingTop; }
+ Index colPaddingLeft() const { return m_colPaddingLeft; }
+ Index outputRows() const { return m_outputRows; }
+ Index outputCols() const { return m_outputCols; }
+ Index userRowStride() const { return m_row_strides; }
+ Index userColStride() const { return m_col_strides; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
+ {
+ // Location of the first element of the patch.
+ // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches
+ const Index patchIndex = coords[3];
+
+ array<Index, NumDims-1> inputCoords;
+ inputCoords[0] = coords[0]; // depth
+ inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop;
+ inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft;
+ inputCoords[3] = coords[4]; // batch
+ // If the computed coordinates are outside the original image perimeter, return 0.
+ if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows ||
+ inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) {
+ return Scalar(0);
+ }
+ if (TensorEvaluator<ArgType, Device>::CoordAccess) {
+ return m_impl.coeff(inputCoords);
+ } else {
+ Index inputIndex =
+ inputCoords[3] * m_patchInputStride +
+ inputCoords[2] * m_colInputStride +
+ inputCoords[1] * m_rowInputStride +
+ inputCoords[0];
+ return m_impl.coeff(inputIndex);
+ }
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ Dimensions m_dimensions;
+
+ Index m_otherStride;
+ Index m_patchStride;
+ Index m_colStride;
+ Index m_row_strides;
+ Index m_col_strides;
+ internal::TensorIntDivisor<Index> m_fastOtherStride;
+ internal::TensorIntDivisor<Index> m_fastPatchStride;
+ internal::TensorIntDivisor<Index> m_fastColStride;
+
+ Index m_rowInputStride;
+ Index m_colInputStride;
+ Index m_patchInputStride;
+
+ Index m_inputRows;
+ Index m_inputCols;
+
+ Index m_outputRows;
+ Index m_outputCols;
+
+ Index m_rowPaddingTop;
+ Index m_colPaddingLeft;
+
+ internal::TensorIntDivisor<Index> m_fastOutputRows;
+ internal::TensorIntDivisor<Index> m_fastDimZero;
+
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
new file mode 100644
index 000000000..eed0a9f05
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -0,0 +1,419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+
+#ifdef EIGEN_HAS_CONSTEXPR
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorIndexList
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Set of classes used to encode a set of Tensor dimensions/indices.
+ *
+ * The indices in the list can be known at compile time or at runtime. A mix
+ * of static and dynamic indices can also be provided if needed. The tensor
+ * code will attempt to take advantage of the indices that are known at
+ * compile time to optimize the code it generates.
+ *
+ * This functionality requires a c++11 compliant compiler. If your compiler
+ * is older you need to use arrays of indices instead.
+ *
+ * Several examples are provided in the cxx11_tensor_index_list.cpp file.
+ *
+ * \sa Tensor
+ */
+
+template <DenseIndex n>
+struct type2index {
+ static const DenseIndex value = n;
+ constexpr operator DenseIndex() const { return n; }
+ void set(DenseIndex val) {
+ eigen_assert(val == n);
+ }
+};
+
+namespace internal {
+template <typename T>
+void update_value(T& val, DenseIndex new_val) {
+ val = new_val;
+}
+template <DenseIndex n>
+void update_value(type2index<n>& val, DenseIndex new_val) {
+ val.set(new_val);
+}
+
+template <typename T>
+struct is_compile_time_constant {
+ static constexpr bool value = false;
+};
+
+template <DenseIndex idx>
+struct is_compile_time_constant<type2index<idx> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<const type2index<idx> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<type2index<idx>& > {
+ static constexpr bool value = true;
+};
+template <DenseIndex idx>
+struct is_compile_time_constant<const type2index<idx>& > {
+ static constexpr bool value = true;
+};
+
+template <DenseIndex Idx>
+struct tuple_coeff {
+ template <typename... T>
+ static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) {
+ return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+ }
+ template <typename... T>
+ static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) {
+ if (i == Idx) {
+ update_value(std::get<Idx>(t), value);
+ } else {
+ tuple_coeff<Idx-1>::set(i, t, value);
+ }
+ }
+
+ template <typename... T>
+ static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) {
+ return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) ||
+ tuple_coeff<Idx-1>::value_known_statically(i, t);
+ }
+
+ template <typename... T>
+ static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) {
+ return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
+ tuple_coeff<Idx-1>::values_up_to_known_statically(t);
+ }
+
+ template <typename... T>
+ static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) {
+ return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
+ is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value &&
+ std::get<Idx>(t) > std::get<Idx-1>(t) &&
+ tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t);
+ }
+};
+
+template <>
+struct tuple_coeff<0> {
+ template <typename... T>
+ static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) {
+ // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
+ return std::get<0>(t) * (i == 0);
+ }
+ template <typename... T>
+ static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) {
+ eigen_assert (i == 0);
+ update_value(std::get<0>(t), value);
+ }
+ template <typename... T>
+ static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>&) {
+ // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
+ return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0);
+ }
+
+ template <typename... T>
+ static constexpr bool values_up_to_known_statically(const std::tuple<T...>&) {
+ return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value;
+ }
+
+ template <typename... T>
+ static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>&) {
+ return true;
+ }
+};
+} // namespace internal
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexList : std::tuple<FirstType, OtherTypes...> {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
+ return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
+ return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value);
+ }
+
+ constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { }
+ constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { }
+
+ constexpr bool value_known_statically(const DenseIndex i) const {
+ return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this);
+ }
+ constexpr bool all_values_known_statically() const {
+ return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this);
+ }
+
+ constexpr bool values_statically_known_to_increase() const {
+ return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this);
+ }
+};
+
+
+template<typename FirstType, typename... OtherTypes>
+constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
+ return std::make_tuple(val1, other_vals...);
+}
+
+
+namespace internal {
+
+template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+ size_t result = 1;
+ for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+ result *= sizes[i];
+ }
+ return result;
+};
+
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+
+template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
+ return std::get<n>(a);
+}
+template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
+ return std::get<n>(a);
+}
+
+template <typename T>
+struct index_known_statically {
+ constexpr bool operator() (DenseIndex) const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+ }
+};
+
+template <typename T>
+struct all_indices_known_statically {
+ constexpr bool operator() () const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() () const {
+ return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() () const {
+ return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+ }
+};
+
+template <typename T>
+struct indices_statically_known_to_increase {
+ constexpr bool operator() () const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() () const {
+ return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() () const {
+ return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+ }
+};
+
+template <typename Tx>
+struct index_statically_eq {
+ constexpr bool operator() (DenseIndex, DenseIndex) const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] == value);
+ }
+};
+
+template <typename T>
+struct index_statically_ne {
+ constexpr bool operator() (DenseIndex, DenseIndex) const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] != value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] != value);
+ }
+};
+
+
+template <typename T>
+struct index_statically_gt {
+ constexpr bool operator() (DenseIndex, DenseIndex) const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] > value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] > value);
+ }
+};
+
+template <typename T>
+struct index_statically_lt {
+ constexpr bool operator() (DenseIndex, DenseIndex) const {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt<IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] < value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > {
+ constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>()[i] < value);
+ }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#else
+
+namespace Eigen {
+namespace internal {
+
+// No C++11 support
+template <typename T>
+struct index_known_statically {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{
+ return false;
+ }
+};
+
+template <typename T>
+struct all_indices_known_statically {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
+ return false;
+ }
+};
+
+template <typename T>
+struct indices_statically_known_to_increase {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_eq {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_ne {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_gt {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_lt {
+ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
+ return false;
+ }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
new file mode 100644
index 000000000..4303e3536
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+
+#include <initializer_list>
+
+namespace Eigen {
+
+/** \class TensorInitializer
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Helper template to initialize Tensors from std::initializer_lists.
+ */
+namespace internal {
+
+template <typename Derived, int N>
+struct Initializer {
+ typedef std::initializer_list<
+ typename Initializer<Derived, N - 1>::InitList> InitList;
+
+ static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+ const InitList& vals) {
+ int i = 0;
+ for (auto v : vals) {
+ (*indices)[traits<Derived>::NumDimensions - N] = i++;
+ Initializer<Derived, N - 1>::run(tensor, indices, v);
+ }
+ }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 1> {
+ typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
+
+ static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+ const InitList& vals) {
+ int i = 0;
+ // There is likely a faster way to do that than iterating.
+ for (auto v : vals) {
+ (*indices)[traits<Derived>::NumDimensions - 1] = i++;
+ tensor.coeffRef(*indices) = v;
+ }
+ }
+};
+
+template <typename Derived, int N>
+void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
+ Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_HAS_VARIADIC_TEMPLATES
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 000000000..2714117ab
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,86 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorIntDiv
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Fast integer division by a constant.
+ *
+ * See the paper from Granlund and Montgomery for explanation.
+ * (at http://dx.doi.org/10.1145/773473.178249)
+ *
+ * \sa Tensor
+ */
+
+namespace internal {
+
+template <typename T>
+struct TensorIntDivisor {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+ multiplier = 0;
+ shift1 = 0;
+ shift2 = 0;
+ }
+
+ // Must have 1 <= divider <= 2^31-1
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+ const int N = 32;
+ eigen_assert(divider > 0);
+ eigen_assert(divider <= (1<<(N-1)) - 1);
+
+ // fast ln2
+#ifndef __CUDA_ARCH__
+ const int leading_zeros = __builtin_clz(divider);
+#else
+ const int leading_zeros = __clz(divider);
+#endif
+ const int log_div = N - (leading_zeros+1);
+
+ multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+ shift1 = log_div > 1 ? 1 : log_div;
+ shift2 = log_div > 1 ? log_div-1 : 0;
+ }
+
+ // Must have 0 <= numerator <= 2^32-1
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+ const int N = 32;
+ eigen_assert(numerator >= 0);
+ eigen_assert(numerator <= (1ull<<N) - 1);
+
+ uint32_t t1 = (multiplier * numerator) >> 32;
+ uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1;
+ return (t1 + t) >> shift2;
+ }
+
+ private:
+ uint64_t multiplier;
+ int32_t shift1;
+ int32_t shift2;
+};
+
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+ return divisor.divide(numerator);
+}
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
new file mode 100644
index 000000000..c119b30e2
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -0,0 +1,198 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+
+namespace Eigen {
+
+/** \class TensorLayoutSwap
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Swap the layout from col-major to row-major, or row-major
+ * to col-major, and invert the order of the dimensions.
+ *
+ * Beware: the dimensions are reversed by this operation. If you want to
+ * preserve the ordering of the dimensions, you need to combine this
+ * operation with a shuffle.
+ *
+ * \example:
+ * Tensor<float, 2, ColMajor> input(2, 4);
+ * Tensor<float, 2, RowMajor> output = input.swap_layout();
+ * eigen_assert(output.dimension(0) == 4);
+ * eigen_assert(output.dimension(1) == 2);
+ *
+ * array<int, 2> shuffle(1, 0);
+ * output = input.swap_layout().shuffle(shuffle);
+ * eigen_assert(output.dimension(0) == 2);
+ * eigen_assert(output.dimension(1) == 4);
+ *
+ */
+namespace internal {
+template<typename XprType>
+struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = traits<XprType>::NumDimensions;
+ static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+};
+
+template<typename XprType>
+struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorLayoutSwapOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
+{
+ typedef TensorLayoutSwapOp<XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename XprType>
+class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+ : m_xpr(expr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+
+// Eval as rvalue
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+ typedef TensorLayoutSwapOp<ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ for(int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
+ }
+ }
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ return m_impl.evalSubExprsIfNeeded(data);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ Dimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename ArgType, typename Device>
+ struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
+ : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
+ typedef TensorLayoutSwapOp<ArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(index);
+ }
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
new file mode 100644
index 000000000..2cb2bc7a6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -0,0 +1,291 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+
+namespace Eigen {
+
+/** \class TensorMap
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A tensor expression mapping an existing array of data.
+ *
+ */
+
+template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> >
+{
+ public:
+ typedef TensorMap<PlainObjectType, Options_> Self;
+ typedef typename PlainObjectType::Base Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+ typedef typename internal::traits<PlainObjectType>::Index Index;
+ typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+
+ /* typedef typename internal::conditional<
+ bool(internal::is_lvalue<PlainObjectType>::value),
+ Scalar *,
+ const Scalar *>::type
+ PointerType;*/
+ typedef Scalar* PointerType;
+ typedef PointerType PointerArgType;
+
+ static const int Options = Options_;
+
+ static const Index NumIndices = PlainObjectType::NumIndices;
+ typedef typename PlainObjectType::Dimensions Dimensions;
+
+ enum {
+ IsAligned = ((int(Options_)&Aligned)==Aligned),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = PlainObjectType::Layout,
+ CoordAccess = true,
+ };
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+ EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+ EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+ EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+ EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#endif
+
+ inline TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+ : m_data(dataPtr), m_dimensions(dimensions)
+ { }
+
+ template <typename Dimensions>
+ EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+ : m_data(dataPtr), m_dimensions(dimensions)
+ { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar* data() { return m_data; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+ {
+ // eigen_assert(checkIndexRange(indices));
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(indices);
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(indices);
+ return m_data[index];
+ }
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return m_data[index];
+ }
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_data[index];
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i1 + i0 * m_dimensions[0];
+ return m_data[index];
+ } else {
+ const Index index = i0 + i1 * m_dimensions[0];
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+ return m_data[index];
+ }
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+ {
+ // eigen_assert(checkIndexRange(indices));
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(indices);
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(indices);
+ return m_data[index];
+ }
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ {
+ static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+ const std::size_t NumDims = sizeof...(otherIndices) + 1;
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
+ return m_data[index];
+ }
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_data[index];
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i1 + i0 * m_dimensions[0];
+ return m_data[index];
+ } else {
+ const Index index = i0 + i1 * m_dimensions[0];
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+ return m_data[index];
+ }
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
+ {
+ typedef TensorAssignOp<Self, const Self> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Self& operator=(const OtherDerived& other)
+ {
+ typedef TensorAssignOp<Self, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ private:
+ Scalar* m_data;
+ Dimensions m_dimensions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
new file mode 100644
index 000000000..a93f48ccb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -0,0 +1,600 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+
+namespace Eigen {
+
+/** \class TensorReshaping
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename NewDimensions, typename XprType>
+struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<NewDimensions>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename NewDimensions, typename XprType>
+struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+};
+
+template<typename NewDimensions, typename XprType>
+struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
+{
+ typedef TensorReshapingOp<NewDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename NewDimensions, typename XprType>
+class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
+ : m_xpr(expr), m_dims(dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const NewDimensions& dimensions() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
+ {
+ typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const NewDimensions m_dims;
+};
+
+
+// Eval as rvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+{
+ typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+ typedef NewDimensions Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_dimensions(op.dimensions())
+ {
+ // The total size of the reshaped tensor must be equal to the total size
+ // of the input tensor.
+ eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ return m_impl.evalSubExprsIfNeeded(data);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ NewDimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+ struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
+ : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+
+{
+ typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
+ typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+ typedef NewDimensions Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(index);
+ }
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ }
+};
+
+
+/** \class TensorSlicing
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor slicing class.
+ *
+ *
+ */
+namespace internal {
+template<typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<StartIndices>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
+{
+ typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
+{
+ typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
+ : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+ EIGEN_DEVICE_FUNC
+ const StartIndices& startIndices() const { return m_indices; }
+ EIGEN_DEVICE_FUNC
+ const Sizes& sizes() const { return m_sizes; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
+ {
+ typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const StartIndices m_indices;
+ const Sizes m_sizes;
+};
+
+
+// Eval as rvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+ typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+ static const int NumDims = internal::array_size<Sizes>::value;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets and sizes.
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
+ {
+ for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const Sizes& output_dims = op.sizes();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ }
+
+ m_outputStrides[0] = 1;
+ m_fastOutputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+ }
+ } else {
+ m_inputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ }
+
+ m_outputStrides[NumDims-1] = 1;
+ m_fastOutputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+ }
+ }
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef Sizes Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) {
+ Index contiguous_values = 1;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ contiguous_values *= dimensions()[i];
+ if (dimensions()[i] != m_impl.dimensions()[i]) {
+ break;
+ }
+ }
+ } else {
+ for (int i = NumDims-1; i >= 0; --i) {
+ contiguous_values *= dimensions()[i];
+ if (dimensions()[i] != m_impl.dimensions()[i]) {
+ break;
+ }
+ }
+ }
+ // Use memcpy if it's going to be faster than using the regular evaluation.
+ if (contiguous_values > 2 * m_device.numThreads()) {
+ Scalar* src = m_impl.data();
+ for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+ Index offset = srcCoeff(i);
+ m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+ }
+ return false;
+ }
+ }
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+ inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + m_offsets[0]);
+ inputIndices[1] += (indices[1] + m_offsets[0]);
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+ inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
+ inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < packetSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords)
+ {
+ array<Index, NumDims> inputCoords;
+ for (int i = 0; i < NumDims; ++i) {
+ inputCoords = coords[i] + this->m_offsets[i];
+ }
+ return m_impl.coeff(inputCoords);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
+ Scalar* result = m_impl.data();
+ if (result) {
+ Index offset = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ if (m_dimensions[i] != m_impl.dimensions()[i]) {
+ offset += m_offsets[i] * m_inputStrides[i];
+ for (int j = i+1; j < NumDims; ++j) {
+ if (m_dimensions[j] > 1) {
+ return NULL;
+ }
+ offset += m_offsets[j] * m_inputStrides[j];
+ }
+ break;
+ }
+ }
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ if (m_dimensions[i] != m_impl.dimensions()[i]) {
+ offset += m_offsets[i] * m_inputStrides[i];
+ for (int j = i-1; j >= 0; --j) {
+ if (m_dimensions[j] > 1) {
+ return NULL;
+ }
+ offset += m_offsets[j] * m_inputStrides[j];
+ }
+ break;
+ }
+ }
+ }
+ return result + offset;
+ }
+ return NULL;
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += (index + m_offsets[0]);
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += (index + m_offsets[NumDims-1]);
+ }
+ return inputIndex;
+ }
+
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ Dimensions m_dimensions;
+ const StartIndices m_offsets;
+};
+
+
+// Eval as lvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+ : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
+ typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+ static const int NumDims = internal::array_size<Sizes>::value;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef Sizes Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+ inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + this->m_offsets[0]);
+ inputIndices[1] += (indices[1] + this->m_offsets[0]);
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+ inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
+ inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
+ }
+ else {
+ EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ this->m_impl.coeffRef(inputIndices[0]) = values[0];
+ this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+ for (int i = 1; i < packetSize-1; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords)
+ {
+ array<Index, NumDims> inputCoords;
+ for (int i = 0; i < NumDims; ++i) {
+ inputCoords = coords[i] + this->m_offsets[i];
+ }
+ return this->m_impl.coeffRef(inputCoords);
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
new file mode 100644
index 000000000..2a7dd45c0
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -0,0 +1,361 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+
+namespace Eigen {
+
+/** \class TensorPadding
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor padding class.
+ * At the moment only 0-padding is supported.
+ *
+ */
+namespace internal {
+template<typename PaddingDimensions, typename XprType>
+struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
+{
+ typedef TensorPaddingOp<PaddingDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename PaddingDimensions, typename XprType>
+class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims)
+ : m_xpr(expr), m_padding_dims(padding_dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const PaddingDimensions& padding() const { return m_padding_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const PaddingDimensions m_padding_dims;
+};
+
+
+// Eval as rvalue
+template<typename PaddingDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
+{
+ typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<PaddingDimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = true,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_padding(op.padding())
+ {
+ // Compute dimensions
+ m_dimensions = m_impl.dimensions();
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] += m_padding[i].first + m_padding[i].second;
+ }
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
+ } else {
+ m_inputStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
+ }
+ m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
+ }
+ }
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(index < dimensions().TotalSize());
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ return Scalar(0);
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
+ return Scalar(0);
+ }
+ inputIndex += (index - m_padding[0].first);
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i+1];
+ if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ return Scalar(0);
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i+1];
+ }
+ if (index < m_padding[NumDims-1].first ||
+ index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+ return Scalar(0);
+ }
+ inputIndex += (index - m_padding[NumDims-1].first);
+ }
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return packetColMajor(index);
+ }
+ return packetRowMajor(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
+ {
+ Index inputIndex;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ const Index idx = coords[0];
+ if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
+ return Scalar(0);
+ }
+ inputIndex = idx - m_padding[0].first;
+ for (int i = 1; i < NumDims; ++i) {
+ const Index idx = coords[i];
+ if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ return Scalar(0);
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ }
+ } else {
+ const Index idx = coords[NumDims-1];
+ if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+ return Scalar(0);
+ }
+ inputIndex = idx - m_padding[NumDims-1].first;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ const Index idx = coords[i];
+ if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ return Scalar(0);
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ }
+ }
+ return m_impl.coeff(inputIndex);
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ const Index initialIndex = index;
+ Index inputIndex = 0;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index first = index;
+ const Index last = index + packetSize - 1;
+ const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+ const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+ const Index lastPaddedRight = m_outputStrides[i+1];
+
+ if (last < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ // all the coefficient are between the 2 padding zones.
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ else {
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+ }
+
+ const Index last = index + packetSize - 1;
+ const Index first = index;
+ const Index lastPaddedLeft = m_padding[0].first;
+ const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+ const Index lastPaddedRight = m_outputStrides[1];
+
+ if (last < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ // all the coefficient are between the 2 padding zones.
+ inputIndex += (index - m_padding[0].first);
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ const Index initialIndex = index;
+ Index inputIndex = 0;
+
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index first = index;
+ const Index last = index + packetSize - 1;
+ const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
+ const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
+ const Index lastPaddedRight = m_outputStrides[i];
+
+ if (last < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ // all the coefficient are between the 2 padding zones.
+ const Index idx = index / m_outputStrides[i+1];
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i+1];
+ }
+ else {
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+ }
+
+ const Index last = index + packetSize - 1;
+ const Index first = index;
+ const Index lastPaddedLeft = m_padding[NumDims-1].first;
+ const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
+ const Index lastPaddedRight = m_outputStrides[NumDims-1];
+
+ if (last < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(Scalar(0));
+ }
+ else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ // all the coefficient are between the 2 padding zones.
+ inputIndex += (index - m_padding[NumDims-1].first);
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ Dimensions m_dimensions;
+ array<Index, NumDims+1> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ PaddingDimensions m_padding;
+};
+
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 000000000..8a42ab6b1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorPatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor patch class.
+ *
+ *
+ */
+namespace internal {
+template<typename PatchDim, typename XprType>
+struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions + 1;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename PatchDim, typename XprType>
+struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
+{
+ typedef const TensorPatchOp<PatchDim, XprType>& type;
+};
+
+template<typename PatchDim, typename XprType>
+struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
+{
+ typedef TensorPatchOp<PatchDim, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename PatchDim, typename XprType>
+class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
+ : m_xpr(expr), m_patch_dims(patch_dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const PatchDim& patch_dims() const { return m_patch_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const PatchDim m_patch_dims;
+};
+
+
+// Eval as rvalue
+template<typename PatchDim, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
+{
+ typedef TensorPatchOp<PatchDim, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = true,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ // Only column major tensors are supported for now.
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ Index num_patches = 1;
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const PatchDim& patch_dims = op.patch_dims();
+ for (int i = 0; i < NumDims-1; ++i) {
+ m_dimensions[i] = patch_dims[i];
+ num_patches *= (input_dims[i] - patch_dims[i] + 1);
+ }
+ m_dimensions[NumDims-1] = num_patches;
+
+ m_inputStrides[0] = 1;
+ m_patchStrides[0] = 1;
+ for (int i = 1; i < NumDims-1; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
+ }
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Find the location of the first element of the patch.
+ Index patchIndex = index / m_outputStrides[NumDims - 1];
+ // Find the offset of the element wrt the location of the first element.
+ Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1];
+
+ Index inputIndex = 0;
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx = patchIndex / m_patchStrides[i];
+ patchIndex -= patchIdx * m_patchStrides[i];
+ const Index offsetIdx = patchOffset / m_outputStrides[i];
+ patchOffset -= offsetIdx * m_outputStrides[i];
+ inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+ }
+ inputIndex += (patchIndex + patchOffset);
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ Index indices[2] = {index, index + packetSize - 1};
+ Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1],
+ indices[1] / m_outputStrides[NumDims - 1]};
+ Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1],
+ indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]};
+
+ Index inputIndices[2] = {0, 0};
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
+ patchIndices[1] / m_patchStrides[i]};
+ patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+ patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+ const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
+ patchOffsets[1] / m_outputStrides[i]};
+ patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
+ patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
+
+ inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+ inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+ }
+ inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
+ inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
+
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < packetSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
+ {
+ // Location of the first element of the patch.
+ const Index patchIndex = coords[NumDims - 1];
+
+ if (TensorEvaluator<ArgType, Device>::CoordAccess) {
+ array<Index, NumDims-1> inputCoords;
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx = patchIndex / m_patchStrides[i];
+ patchIndex -= patchIdx * m_patchStrides[i];
+ const Index offsetIdx = coords[i];
+ inputCoords[i] = coords[i] + patchIdx;
+ }
+ inputCoords[0] = (patchIndex + coords[0]);
+ return m_impl.coeff(inputCoords);
+ }
+ else {
+ Index inputIndex = 0;
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx = patchIndex / m_patchStrides[i];
+ patchIndex -= patchIdx * m_patchStrides[i];
+ const Index offsetIdx = coords[i];
+ inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+ }
+ inputIndex += (patchIndex + coords[0]);
+ return m_impl.coeff(inputIndex);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims-1> m_inputStrides;
+ array<Index, NumDims-1> m_patchStrides;
+
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 000000000..de5747905
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,426 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+namespace Eigen {
+
+/** \class TensorReduction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reduction class.
+ *
+ */
+
+namespace internal {
+template<typename Op, typename Dims, typename XprType>
+struct traits<TensorReductionOp<Op, Dims, XprType> >
+ : traits<XprType>
+{
+ typedef typename traits<XprType>::Scalar Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::Index Index;
+ typedef typename XprType::Nested Nested;
+};
+
+template<typename Op, typename Dims, typename XprType>
+struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
+{
+ typedef const TensorReductionOp<Op, Dims, XprType>& type;
+};
+
+template<typename Op, typename Dims, typename XprType>
+struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
+{
+ typedef TensorReductionOp<Op, Dims, XprType> type;
+};
+
+
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct are_inner_most_dims {
+ static const bool value = false;
+};
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct preserve_inner_most_dims {
+ static const bool value = false;
+};
+
+#ifdef EIGEN_HAS_CONSTEXPR
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+ static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
+ index_statically_eq<ReducedDims>()(0, 0) &&
+ index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+};
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+ static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
+ index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value) &&
+ index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+ static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
+ index_statically_gt<ReducedDims>()(0, 0);
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+ static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
+ index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+};
+#endif
+
+
+template <int DimIndex, typename Self, typename Op>
+struct GenericDimReducer {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+ EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+ GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+ }
+ }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<0, Self, Op> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+ for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+ reducer.reduce(self.m_impl.coeff(input), accum);
+ }
+ }
+};
+
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct InnerMostDimReducer {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ return reducer.finalize(accum);
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+ const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+ const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+ typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
+ for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
+ }
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ return reducer.finalizeBoth(accum, p);
+ }
+};
+
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct InnerMostDimPreserver {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+ eigen_assert(false && "should never be called");
+ }
+};
+
+template <int DimIndex, typename Self, typename Op>
+struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+ EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+ InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+ }
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<0, Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+ for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+ }
+ }
+};
+
+} // end namespace internal
+
+
+template <typename Op, typename Dims, typename XprType>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
+ public:
+ typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
+ { }
+ TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const XprType& expression() const { return m_expr; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dims& dims() const { return m_dims; }
+ const Op& reducer() const { return m_reducer; }
+
+ protected:
+ typename XprType::Nested m_expr;
+ const Dims m_dims;
+ const Op m_reducer;
+};
+
+
+// Eval as rvalue
+template<typename Op, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
+{
+ typedef TensorReductionOp<Op, Dims, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumReducedDims = internal::array_size<Dims>::value;
+ static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims;
+ typedef DSizes<Index, NumOutputDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
+ static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
+ static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_reducer(op.reducer())
+ {
+ EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ // Bitmap indicating if an input dimension is reduced or not.
+ array<bool, NumInputDims> reduced;
+ for (int i = 0; i < NumInputDims; ++i) {
+ reduced[i] = false;
+ }
+ for (int i = 0; i < NumReducedDims; ++i) {
+ eigen_assert(op.dims()[i] >= 0);
+ eigen_assert(op.dims()[i] < NumInputDims);
+ reduced[op.dims()[i]] = true;
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ int outputIndex = 0;
+ int reduceIndex = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (reduced[i]) {
+ m_reducedDims[reduceIndex] = input_dims[i];
+ ++reduceIndex;
+ } else {
+ m_dimensions[outputIndex] = input_dims[i];
+ ++outputIndex;
+ }
+ }
+
+ // Precompute output strides.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumOutputDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ m_outputStrides[NumOutputDims - 1] = 1;
+ for (int i = NumOutputDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+
+ // Precompute input strides.
+ array<Index, NumInputDims> input_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_strides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ input_strides[i] = input_strides[i-1] * input_dims[i-1];
+ }
+ } else {
+ input_strides[NumInputDims - 1] = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ outputIndex = 0;
+ reduceIndex = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (reduced[i]) {
+ m_reducedStrides[reduceIndex] = input_strides[i];
+ ++reduceIndex;
+ } else {
+ m_preservedStrides[outputIndex] = input_strides[i];
+ ++outputIndex;
+ }
+ }
+
+ // Special case for full reductions
+ if (NumInputDims == NumReducedDims) {
+ m_dimensions[0] = 1;
+ m_preservedStrides[0] = internal::array_prod(input_dims);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ Op reducer(m_reducer);
+ if (ReducingInnerMostDims) {
+ const Index num_values_to_reduce =
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
+ return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
+ num_values_to_reduce, reducer);
+ } else {
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
+ return reducer.finalize(accum);
+ }
+ }
+
+ // TODO(bsteiner): provide a more efficient implementation.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ if (ReducingInnerMostDims) {
+ const Index num_values_to_reduce =
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
+ const Index firstIndex = firstInput(index);
+ for (Index i = 0; i < packetSize; ++i) {
+ Op reducer(m_reducer);
+ values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
+ num_values_to_reduce, reducer);
+ }
+ } else if (PreservingInnerMostDims) {
+ const Index firstIndex = firstInput(index);
+ const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
+ // TBD: extend this the the n innermost dimensions that we preserve.
+ if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
+ Op reducer(m_reducer);
+ typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
+ internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
+ return reducer.finalizePacket(accum);
+ } else {
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ }
+ } else {
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+ template <int, typename, typename> friend struct internal::GenericDimReducer;
+ template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
+ template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
+
+ // Returns the Index in the input tensor of the first value that needs to be
+ // used to compute the reduction at output index "index".
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ if (ReducingInnerMostDims) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return index * m_preservedStrides[0];
+ } else {
+ return index * m_preservedStrides[NumOutputDims - 1];
+ }
+ }
+ // TBD: optimize the case where we preserve the innermost dimensions.
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ // This is index_i in the output tensor.
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[0];
+ } else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ // This is index_i in the output tensor.
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[NumOutputDims - 1];
+ }
+ return startInput;
+ }
+
+ // Dimensions of the output of the operation.
+ Dimensions m_dimensions;
+ // Precomputed strides for the output tensor.
+ array<Index, NumOutputDims> m_outputStrides;
+ // Subset of strides of the input tensor for the non-reduced dimensions.
+ // Indexed by output dimensions.
+ array<Index, NumOutputDims> m_preservedStrides;
+
+ // Subset of strides of the input tensor for the reduced dimensions.
+ // Indexed by reduced dimensions.
+ array<Index, NumReducedDims> m_reducedStrides;
+ // Size of the input dimensions that are reduced.
+ // Indexed by reduced dimensions.
+ array<Index, NumReducedDims> m_reducedDims;
+
+ // Evaluator for the input expression.
+ TensorEvaluator<ArgType, Device> m_impl;
+
+ // Operation to apply for computing the reduction.
+ Op m_reducer;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
new file mode 100644
index 000000000..0a87e67eb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -0,0 +1,429 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename Scalar>
+class TensorLazyBaseEvaluator {
+ public:
+ TensorLazyBaseEvaluator() : m_refcount(0) { }
+ virtual ~TensorLazyBaseEvaluator() { }
+
+ virtual const Dimensions& dimensions() const = 0;
+ virtual const Scalar* data() const = 0;
+
+ virtual const Scalar coeff(DenseIndex index) const = 0;
+ virtual Scalar& coeffRef(DenseIndex index) = 0;
+
+ void incrRefCount() { ++m_refcount; }
+ void decrRefCount() { --m_refcount; }
+ int refCount() const { return m_refcount; }
+
+ private:
+ // No copy, no assigment;
+ TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
+ TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
+
+ int m_refcount;
+};
+
+static char dummy[8];
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
+ public:
+ // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
+ typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+
+ TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) {
+ m_dims = m_impl.dimensions();
+ m_impl.evalSubExprsIfNeeded(NULL);
+ }
+ virtual ~TensorLazyEvaluatorReadOnly() {
+ m_impl.cleanup();
+ }
+
+ virtual const Dimensions& dimensions() const {
+ return m_dims;
+ }
+ virtual const Scalar* data() const {
+ return m_impl.data();
+ }
+
+ virtual const Scalar coeff(DenseIndex index) const {
+ return m_impl.coeff(index);
+ }
+ virtual Scalar& coeffRef(DenseIndex /*index*/) {
+ eigen_assert(false && "can't reference the coefficient of a rvalue");
+ return *reinterpret_cast<Scalar*>(dummy);
+ };
+
+ protected:
+ TensorEvaluator<Expr, Device> m_impl;
+ Dimensions m_dims;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
+ public:
+ typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
+ typedef typename Base::Scalar Scalar;
+
+ TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
+ }
+ virtual ~TensorLazyEvaluatorWritable() {
+ }
+
+ virtual Scalar& coeffRef(DenseIndex index) {
+ return this->m_impl.coeffRef(index);
+ }
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
+ TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+ TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
+ public:
+ typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
+ TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+ TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
+ typedef typename Base::Scalar Scalar;
+
+ TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
+ }
+ virtual ~TensorLazyEvaluator() {
+ }
+};
+
+} // namespace internal
+
+
+/** \class TensorRef
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A reference to a tensor expression
+ * The expression will be evaluated lazily (as much as possible).
+ *
+ */
+template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
+{
+ public:
+ typedef TensorRef<PlainObjectType> Self;
+ typedef typename PlainObjectType::Base Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+ typedef typename internal::traits<PlainObjectType>::Index Index;
+ typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+ typedef Scalar* PointerType;
+ typedef PointerType PointerArgType;
+
+ static const Index NumIndices = PlainObjectType::NumIndices;
+ typedef typename PlainObjectType::Dimensions Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ Layout = PlainObjectType::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
+ }
+
+ template <typename Expression>
+ EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
+ m_evaluator->incrRefCount();
+ }
+
+ template <typename Expression>
+ EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
+ unrefEvaluator();
+ m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
+ m_evaluator->incrRefCount();
+ return *this;
+ }
+
+ ~TensorRef() {
+ unrefEvaluator();
+ }
+
+ TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
+ eigen_assert(m_evaluator->refCount() > 0);
+ m_evaluator->incrRefCount();
+ }
+
+ TensorRef& operator = (const TensorRef& other) {
+ if (this != &other) {
+ unrefEvaluator();
+ m_evaluator = other.m_evaluator;
+ eigen_assert(m_evaluator->refCount() > 0);
+ m_evaluator->incrRefCount();
+ }
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
+ {
+ return m_evaluator->coeff(index);
+ }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
+ const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
+ return coeff(indices);
+ }
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+ {
+ const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
+ const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
+ return coeffRef(indices);
+ }
+#else
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
+ {
+ array<Index, 2> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
+ {
+ array<Index, 3> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ array<Index, 4> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ array<Index, 5> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ indices[4] = i4;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
+ {
+ array<Index, 2> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
+ {
+ array<Index, 3> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ array<Index, 4> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ array<Index, 5> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ indices[4] = i4;
+ return coeffRef(indices);
+ }
+#endif
+
+ template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
+ {
+ const Dimensions& dims = this->dimensions();
+ Index index = 0;
+ if (PlainObjectType::Options & RowMajor) {
+ index += indices[0];
+ for (int i = 1; i < NumIndices; ++i) {
+ index = index * dims[i] + indices[i];
+ }
+ } else {
+ index += indices[NumIndices-1];
+ for (int i = NumIndices-2; i >= 0; --i) {
+ index = index * dims[i] + indices[i];
+ }
+ }
+ return m_evaluator->coeff(index);
+ }
+ template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+ {
+ const Dimensions& dims = this->dimensions();
+ Index index = 0;
+ if (PlainObjectType::Options & RowMajor) {
+ index += indices[0];
+ for (int i = 1; i < NumIndices; ++i) {
+ index = index * dims[i] + indices[i];
+ }
+ } else {
+ index += indices[NumIndices-1];
+ for (int i = NumIndices-2; i >= 0; --i) {
+ index = index * dims[i] + indices[i];
+ }
+ }
+ return m_evaluator->coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+ {
+ return m_evaluator->coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ return m_evaluator->coeffRef(index);
+ }
+
+ private:
+ EIGEN_STRONG_INLINE void unrefEvaluator() {
+ if (m_evaluator) {
+ m_evaluator->decrRefCount();
+ if (m_evaluator->refCount() == 0) {
+ delete m_evaluator;
+ }
+ }
+ }
+
+ internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
+};
+
+
+// evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const TensorRef<Derived>, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Packet Packet;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename Derived::Packet PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ Layout = TensorRef<Derived>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+ : m_ref(m)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_ref.coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ return m_ref.coeffRef(index);
+ }
+
+ Scalar* data() const { return m_ref.data(); }
+
+ protected:
+ TensorRef<Derived> m_ref;
+};
+
+
+// evaluator for lvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Packet Packet;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename Derived::Packet PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+
+ typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ return this->m_ref.coeffRef(index);
+ }
+};
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
new file mode 100644
index 000000000..ad21e966b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -0,0 +1,207 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+namespace Eigen {
+
+/** \class TensorReverse
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reverse elements class.
+ *
+ */
+namespace internal {
+template<typename ReverseDimensions, typename XprType>
+struct traits<TensorReverseOp<ReverseDimensions,
+ XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
+ typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
+{
+ typedef TensorReverseOp<ReverseDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+
+template<typename ReverseDimensions, typename XprType>
+class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
+ XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+ StorageKind;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr,
+ const ReverseDimensions& reverse_dims)
+ : m_xpr(expr), m_reverse_dims(reverse_dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const ReverseDimensions& reverse() const { return m_reverse_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const ReverseDimensions m_reverse_dims;
+};
+
+
+// Eval as rvalue
+template<typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
+{
+ typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<ReverseDimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+ const Device& device)
+ : m_impl(op.expression(), device), m_reverse(op.reverse())
+ {
+ // Compute strides
+ m_dimensions = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+ }
+ } else {
+ m_strides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+ }
+ }
+ }
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(index < dimensions().TotalSize());
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ Index idx = index / m_strides[i];
+ index -= idx * m_strides[i];
+ if (m_reverse[i]) {
+ idx = m_dimensions[i] - idx - 1;
+ }
+ inputIndex += idx * m_strides[i] ;
+ }
+ if (m_reverse[0]) {
+ inputIndex += (m_dimensions[0] - index - 1);
+ } else {
+ inputIndex += index;
+ }
+ return m_impl.coeff(inputIndex);
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ Index idx = index / m_strides[i];
+ index -= idx * m_strides[i];
+ if (m_reverse[i]) {
+ idx = m_dimensions[i] - idx - 1;
+ }
+ inputIndex += idx * m_strides[i] ;
+ }
+ if (m_reverse[NumDims-1]) {
+ inputIndex += (m_dimensions[NumDims-1] - index - 1);
+ } else {
+ inputIndex += index;
+ }
+ return m_impl.coeff(inputIndex);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ // TODO(ndjaitly): write a better packing routine that uses
+ // local structure.
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type
+ values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_strides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ ReverseDimensions m_reverse;
+};
+
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
new file mode 100644
index 000000000..1012ecd69
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+
+namespace Eigen {
+
+/** \class TensorShuffling
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor shuffling class.
+ *
+ *
+ */
+namespace internal {
+template<typename Shuffle, typename XprType>
+struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Shuffle, typename XprType>
+struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
+{
+ typedef const TensorShufflingOp<Shuffle, XprType>& type;
+};
+
+template<typename Shuffle, typename XprType>
+struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
+{
+ typedef TensorShufflingOp<Shuffle, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Shuffle, typename XprType>
+class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
+ : m_xpr(expr), m_shuffle(shuffle) {}
+
+ EIGEN_DEVICE_FUNC
+ const Shuffle& shuffle() const { return m_shuffle; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
+ {
+ typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Shuffle m_shuffle;
+};
+
+
+// Eval as rvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+ typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const Shuffle& shuffle = op.shuffle();
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] = input_dims[shuffle[i]];
+ }
+
+ array<Index, NumDims> inputStrides;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ inputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ inputStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = inputStrides[shuffle[i]];
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ return inputIndex + index * m_inputStrides[0];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ return inputIndex + index * m_inputStrides[NumDims - 1];
+ }
+ }
+
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+// Eval as lvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
+ : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
+
+ typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ for (int i = 0; i < packetSize; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index a34600ee6..1b227e8c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -30,39 +30,70 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
+template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
+
+
+// Pure fixed-size storage
+template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions>
+class TensorStorage
+{
+ private:
+ EIGEN_ALIGN_DEFAULT T m_data[Size];
+ FixedDimensions m_dimensions;
+
+ public:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStorage() {
+ EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T *data() { return m_data; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+};
+
+
// pure-dynamic, but without specification of all dimensions explicitly
-template<typename T, std::size_t NumIndices_, int Options_>
+template<typename T, DenseIndex NumIndices_, int Options_>
class TensorStorage<T, NumIndices_, Dynamic, Options_, void>
: public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
{
- typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_;
+ typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_;
+
public:
- TensorStorage() = default;
- TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
- TensorStorage(TensorStorage<T, NumIndices_, Dynamic, Options_, void>&&) = default;
+ TensorStorage() { }
+ TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>& other) : Base_(other) { }
+
TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {}
- TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
- TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
+ TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
+
+ // TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
};
// pure dynamic
-template<typename T, std::size_t NumIndices_, int Options_>
+template<typename T, DenseIndex NumIndices_, int Options_>
class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
{
T *m_data;
- std::array<DenseIndex, NumIndices_> m_dimensions;
+ DSizes<DenseIndex, NumIndices_> m_dimensions;
typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Self_;
public:
- TensorStorage() : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {}
+ TensorStorage() : m_data(0), m_dimensions() {}
TensorStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {}
- TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions)
- : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
- { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
- TensorStorage(const Self_& other)
+ TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions)
+ : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
+ { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
+ TensorStorage(const Self_& other)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
, m_dimensions(other.m_dimensions)
{
@@ -76,32 +107,19 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
}
return *this;
}
- TensorStorage(Self_&& other)
- : m_data(std::move(other.m_data)), m_dimensions(std::move(other.m_dimensions))
- {
- other.m_data = nullptr;
- }
- Self_& operator=(Self_&& other)
- {
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_dimensions, other.m_dimensions);
- return *this;
- }
+
~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
void swap(Self_& other)
{ std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); }
- std::array<DenseIndex, NumIndices_> dimensions(void) const {return m_dimensions;}
- void conservativeResize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions)
- {
- m_data = internal::conditional_aligned_realloc_new_auto<T,(Options_&DontAlign)==0>(m_data, size, internal::array_prod(m_dimensions));
- m_dimensions = nbDimensions;
- }
- void resize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions)
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const {return m_dimensions;}
+
+ EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions)
{
- if(size != internal::array_prod(m_dimensions))
+ const DenseIndex currentSz = internal::array_prod(m_dimensions);
+ if(size != currentSz)
{
- internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions));
+ internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
if (size)
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
else
@@ -110,16 +128,13 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
}
m_dimensions = nbDimensions;
}
- const T *data() const { return m_data; }
- T *data() { return m_data; }
-};
-// TODO: implement fixed-size stuff
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
new file mode 100644
index 000000000..00cb8e373
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+
+namespace Eigen {
+
+/** \class TensorStriding
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor striding class.
+ *
+ *
+ */
+namespace internal {
+template<typename Strides, typename XprType>
+struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Strides, typename XprType>
+struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorStridingOp<Strides, XprType>& type;
+};
+
+template<typename Strides, typename XprType>
+struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
+{
+ typedef TensorStridingOp<Strides, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Strides, typename XprType>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+ : m_xpr(expr), m_dims(dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const Strides& strides() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
+ {
+ typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Strides m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+ typedef TensorStridingOp<Strides, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ m_dimensions = m_impl.dimensions();
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_inputStrides[i-1] *= op.strides()[i-1];
+ }
+ m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
+ } else { // RowMajor
+ m_outputStrides[NumDims-1] = 1;
+ m_inputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_inputStrides[i+1] *= op.strides()[i+1];
+ }
+ m_inputStrides[0] *= op.strides()[0];
+ }
+ }
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStrides[i];
+ const Index idx1 = indices[1] / m_outputStrides[i];
+ inputIndices[0] += idx0 * m_inputStrides[i];
+ inputIndices[1] += idx1 * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * m_inputStrides[0];
+ inputIndices[1] += indices[1] * m_inputStrides[0];
+ } else { // RowMajor
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_outputStrides[i];
+ const Index idx1 = indices[1] / m_outputStrides[i];
+ inputIndices[0] += idx0 * m_inputStrides[i];
+ inputIndices[1] += idx1 * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
+ inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < packetSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += index * m_inputStrides[0];
+ } else { // RowMajor
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += index * m_inputStrides[NumDims-1];
+ }
+ return inputIndex;
+ }
+
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+// Eval as lvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
+ : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+ typedef TensorStridingOp<Strides, ArgType> XprType;
+ typedef TensorEvaluator<const XprType, Device> Base;
+ // typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ // typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device) { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / this->m_outputStrides[i];
+ const Index idx1 = indices[1] / this->m_outputStrides[i];
+ inputIndices[0] += idx0 * this->m_inputStrides[i];
+ inputIndices[1] += idx1 * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * this->m_inputStrides[0];
+ inputIndices[1] += indices[1] * this->m_inputStrides[0];
+ } else { // RowMajor
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / this->m_outputStrides[i];
+ const Index idx1 = indices[1] / this->m_outputStrides[i];
+ inputIndices[0] += idx0 * this->m_inputStrides[i];
+ inputIndices[1] += idx1 * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
+ inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
+ }
+ else {
+ EIGEN_ALIGN_DEFAULT Scalar values[packetSize];
+ internal::pstore<Scalar, PacketReturnType>(values, x);
+ this->m_impl.coeffRef(inputIndices[0]) = values[0];
+ this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+ for (int i = 1; i < packetSize-1; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
new file mode 100644
index 000000000..a844a4d68
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -0,0 +1,256 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+
+namespace Eigen {
+namespace internal {
+
+
+template<typename Scalar, int Options>
+class compute_tensor_flags
+{
+ enum {
+ is_dynamic_size_storage = 1,
+
+ aligned_bit =
+ (
+ ((Options&DontAlign)==0) && (
+#if EIGEN_ALIGN_STATICALLY
+ (!is_dynamic_size_storage)
+#else
+ 0
+#endif
+ ||
+#if EIGEN_ALIGN
+ is_dynamic_size_storage
+#else
+ 0
+#endif
+ )
+ ) ? AlignedBit : 0,
+ packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
+ };
+
+ public:
+ enum { ret = packet_access_bit | aligned_bit};
+};
+
+
+template<typename Scalar_, std::size_t NumIndices_, int Options_>
+struct traits<Tensor<Scalar_, NumIndices_, Options_> >
+{
+ typedef Scalar_ Scalar;
+ typedef Dense StorageKind;
+ typedef DenseIndex Index;
+ static const int NumDimensions = NumIndices_;
+ static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+ enum {
+ Options = Options_,
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
+ };
+};
+
+
+template<typename Scalar_, typename Dimensions, int Options_>
+struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
+{
+ typedef Scalar_ Scalar;
+ typedef Dense StorageKind;
+ typedef DenseIndex Index;
+ static const int NumDimensions = array_size<Dimensions>::value;
+ static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+ enum {
+ Options = Options_,
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
+ };
+};
+
+
+template<typename PlainObjectType, int Options_>
+struct traits<TensorMap<PlainObjectType, Options_> >
+ : public traits<PlainObjectType>
+{
+ typedef traits<PlainObjectType> BaseTraits;
+ typedef typename BaseTraits::Scalar Scalar;
+ typedef typename BaseTraits::StorageKind StorageKind;
+ typedef typename BaseTraits::Index Index;
+ static const int NumDimensions = BaseTraits::NumDimensions;
+ static const int Layout = BaseTraits::Layout;
+ enum {
+ Options = Options_,
+ Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+ };
+};
+
+template<typename PlainObjectType>
+struct traits<TensorRef<PlainObjectType> >
+ : public traits<PlainObjectType>
+{
+ typedef traits<PlainObjectType> BaseTraits;
+ typedef typename BaseTraits::Scalar Scalar;
+ typedef typename BaseTraits::StorageKind StorageKind;
+ typedef typename BaseTraits::Index Index;
+ static const int NumDimensions = BaseTraits::NumDimensions;
+ static const int Layout = BaseTraits::Layout;
+ enum {
+ Options = BaseTraits::Options,
+ Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+ };
+};
+
+
+template<typename _Scalar, std::size_t NumIndices_, int Options>
+struct eval<Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
+{
+ typedef const Tensor<_Scalar, NumIndices_, Options>& type;
+};
+
+template<typename _Scalar, std::size_t NumIndices_, int Options>
+struct eval<const Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
+{
+ typedef const Tensor<_Scalar, NumIndices_, Options>& type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options>
+struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options>
+struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+};
+
+template<typename PlainObjectType, int Options>
+struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
+{
+ typedef const TensorMap<PlainObjectType, Options>& type;
+};
+
+template<typename PlainObjectType, int Options>
+struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
+{
+ typedef const TensorMap<PlainObjectType, Options>& type;
+};
+
+template<typename PlainObjectType>
+struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
+{
+ typedef const TensorRef<PlainObjectType>& type;
+};
+
+template<typename PlainObjectType>
+struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
+{
+ typedef const TensorRef<PlainObjectType>& type;
+};
+
+
+template <typename Scalar_, std::size_t NumIndices_, int Options_>
+struct nested<Tensor<Scalar_, NumIndices_, Options_> >
+{
+ typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
+};
+
+template <typename Scalar_, std::size_t NumIndices_, int Options_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_> >
+{
+ typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options> >
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options> >
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+};
+
+
+template <typename PlainObjectType, int Options>
+struct nested<TensorMap<PlainObjectType, Options> >
+{
+ typedef const TensorMap<PlainObjectType, Options>& type;
+};
+
+template <typename PlainObjectType, int Options>
+struct nested<const TensorMap<PlainObjectType, Options> >
+{
+ typedef const TensorMap<PlainObjectType, Options>& type;
+};
+
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType> >
+{
+ typedef const TensorRef<PlainObjectType>& type;
+};
+
+template <typename PlainObjectType>
+struct nested<const TensorRef<PlainObjectType> >
+{
+ typedef const TensorRef<PlainObjectType>& type;
+};
+
+} // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension. It also takes in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+// apply the filters. A larger stride means that the output will be
+// spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+// dimensions. This is usually used to make sure that the spatial
+// dimensions of the output matches our intention.
+//
+// Two types of padding are often used:
+// SAME: The pad value is computed so that the output will have size
+// R/S and C/S.
+// VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+// R' = ceil(float(R) / float(S))
+// C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The input tensor is padded with 0 as
+// needed. The number of padded rows and columns are computed as:
+// Pr = ((R' - 1) * S + K - R) / 2
+// Pc = ((C' - 1) * S + K - C) / 2
+// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where SAME comes from - the output has the same size as the input has.
+// When Padding = VALID: the output size is computed as
+// R' = ceil(float(R - K + 1) / float(S))
+// C' = ceil(float(C - K + 1) / float(S))
+// and the number of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
+typedef enum {
+ PADDING_VALID = 1,
+ PADDING_SAME = 2,
+} PaddingType;
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H